Esempio n. 1
0
 def test_corpus_creation(self):
     """Check that creating the test corpus yields both side files.

     Obtains a corpus from ``self._create_test_corpus()``, asks it for its
     source and target file paths and asserts that each path exists.
     The corpus is closed even when one of the assertions fails.
     """
     corpus = self._create_test_corpus()
     try:
         filepath_source, filepath_target = corpus.get_filepaths()
         self.assertTrue(assertions.file_exists(filepath_source),
                         "File for source side of parallel corpus not created")
         self.assertTrue(assertions.file_exists(filepath_target),
                         "File for target side of parallel corpus not created")
     finally:
         # Close on every path so that a failing assertion does not leave
         # the corpus open for the remainder of the test run.
         corpus.close()
Esempio n. 2
0
 def test_preprocess_external_eval_corpus(self):
     """Check that preprocessing writes the external evaluation corpus.

     A ``TrainingMoses`` instance (en -> fr) is created in a newly made
     base directory with ``evaluation`` pointing at an external corpus
     base path. After a 200-bisegment base corpus and a 50-bisegment
     external evaluation corpus have been generated, ``preprocess`` is
     run and the test verifies, for both language sides, that the
     evaluation corpus file exists under ``<basedir>/corpus`` and holds
     a plausible number of segments.
     """
     num_eval_bisegments = 50
     random_basedir_name = self.get_random_basename()
     os.mkdir(random_basedir_name)
     external_eval_basepath = os.sep.join(
         [self._basedir_test_cases, "external-sample-corpus"]
     )
     # Optional settings are passed by keyword so that they cannot be
     # confused with the leading positional arguments.
     t = TrainingMoses(
         random_basedir_name, "en", "fr", SELFCASING,
         tuning=None,
         evaluation=external_eval_basepath,
         masking_strategy=None, xml_strategy=XML_PASS_THROUGH
     )
     # create sample base corpus
     self._create_random_parallel_corpus_files(
         path=random_basedir_name,
         filename_source="sample-corpus.en",
         filename_target="sample-corpus.fr",
         num_bisegments=200
     )
     # create sample external eval corpus
     self._create_random_parallel_corpus_files(
         path=self._basedir_test_cases,
         filename_source="external-sample-corpus.en",
         filename_target="external-sample-corpus.fr",
         num_bisegments=num_eval_bisegments
     )
     t.preprocess(os.sep.join([random_basedir_name, "sample-corpus"]), 1, 80, True, False, False)

     eval_basepath = os.sep.join(
         [random_basedir_name, "corpus", BASENAME_EVALUATION_CORPUS]
     )
     filepath_source = eval_basepath + ".en"
     filepath_target = eval_basepath + ".fr"
     self.assertTrue(
         assertions.file_exists(filepath_source),
         "Source side of external evaluation corpus must be created"
     )
     self.assertTrue(
         assertions.file_exists(filepath_target),
         "Target side of external evaluation corpus must be created"
     )

     # assertTrue() on a line count only proves that the file is not
     # empty, so the counts are checked explicitly instead.
     # NOTE(review): whether preprocess() drops segments from an external
     # corpus is not visible here, so bounds are asserted rather than an
     # exact count; tighten to assertEqual if no filtering takes place.
     # Assumes count_lines() returns an int - TODO confirm.
     num_source = self.count_lines(filepath_source)
     num_target = self.count_lines(filepath_target)
     self.assertGreater(
         num_source, 0,
         "Number of segments in source side of external evaluation corpus must be correct"
     )
     self.assertLessEqual(
         num_source, num_eval_bisegments,
         "Number of segments in source side of external evaluation corpus must be correct"
     )
     self.assertGreater(
         num_target, 0,
         "Number of segments in target side of external evaluation corpus must be correct"
     )
     self.assertLessEqual(
         num_target, num_eval_bisegments,
         "Number of segments in target side of external evaluation corpus must be correct"
     )
     # Both sides of a parallel corpus must stay aligned line by line.
     self.assertEqual(
         num_source, num_target,
         "Source and target side of external evaluation corpus must have the same number of segments"
     )
Esempio n. 3
0
    def test_preprocess_external_eval_corpus(self):
        """Check that preprocessing writes the external evaluation corpus.

        A ``TrainingNematus`` instance (en -> fr) is created in a newly
        made base directory with ``evaluation`` pointing at an external
        corpus base path. After a 200-bisegment base corpus and a
        50-bisegment external evaluation corpus have been generated,
        ``preprocess`` is run and the test verifies, for both language
        sides, that the evaluation corpus file exists under
        ``<basedir>/corpus`` and holds a plausible number of segments.
        """
        num_eval_bisegments = 50
        random_basedir_name = self.get_random_basename()
        os.mkdir(random_basedir_name)
        external_eval_basepath = os.sep.join(
            [self._basedir_test_cases, "external-sample-corpus"]
        )

        t = TrainingNematus(
            random_basedir_name, "en", "fr", TRUECASING,
            tuning=None,
            evaluation=external_eval_basepath
        )
        # create sample base corpus
        self._create_random_parallel_corpus_files(
            path=random_basedir_name,
            filename_source="sample-corpus.en",
            filename_target="sample-corpus.fr",
            num_bisegments=200
        )
        # create sample external eval corpus
        self._create_random_parallel_corpus_files(
            path=self._basedir_test_cases,
            filename_source="external-sample-corpus.en",
            filename_target="external-sample-corpus.fr",
            num_bisegments=num_eval_bisegments
        )

        t.preprocess(os.sep.join([random_basedir_name, "sample-corpus"]), 1, 80, True)

        eval_basepath = os.sep.join(
            [random_basedir_name, "corpus", BASENAME_EVALUATION_CORPUS]
        )
        filepath_source = eval_basepath + ".en"
        filepath_target = eval_basepath + ".fr"
        self.assertTrue(
            assertions.file_exists(filepath_source),
            "Source side of external evaluation corpus must be created"
        )
        self.assertTrue(
            assertions.file_exists(filepath_target),
            "Target side of external evaluation corpus must be created"
        )

        # assertTrue() on a line count only proves that the file is not
        # empty, so the counts are checked explicitly instead.
        # NOTE(review): whether preprocess() drops segments from an
        # external corpus is not visible here, so bounds are asserted
        # rather than an exact count; tighten to assertEqual if no
        # filtering takes place.
        # Assumes count_lines() returns an int - TODO confirm.
        num_source = self.count_lines(filepath_source)
        num_target = self.count_lines(filepath_target)
        self.assertGreater(
            num_source, 0,
            "Number of segments in source side of external evaluation corpus must be correct"
        )
        self.assertLessEqual(
            num_source, num_eval_bisegments,
            "Number of segments in source side of external evaluation corpus must be correct"
        )
        self.assertGreater(
            num_target, 0,
            "Number of segments in target side of external evaluation corpus must be correct"
        )
        self.assertLessEqual(
            num_target, num_eval_bisegments,
            "Number of segments in target side of external evaluation corpus must be correct"
        )
        # Both sides of a parallel corpus must stay aligned line by line.
        self.assertEqual(
            num_source, num_target,
            "Source and target side of external evaluation corpus must have the same number of segments"
        )