Beispiel #1
0
 def test_preprocess_remove_empty_lines(self):
     """Check that preprocessing drops bi-segments with an empty side.

     Writes a six-line parallel corpus in which line 2 is empty on both
     sides and line 5 is empty on the target side only, runs
     ``preprocess`` with ``min_tokens=1`` and ``max_tokens=80``, and
     asserts that ``corpus/train.en`` and ``corpus/train.fr`` each end up
     with four lines.
     """
     random_basedir_name = self.get_random_basename()
     os.mkdir(random_basedir_name)
     with open(os.path.join(random_basedir_name, 'sample-corpus.en'), 'w') as f:
         f.write('one' + '\n')
         f.write('\n') # must be removed
         f.write('one two three' + '\n')
         f.write('one two' + '\n')
         f.write('one' + '\n') # must be removed (because .fr is empty)
         f.write('one two' + '\n')
     with open(os.path.join(random_basedir_name, 'sample-corpus.fr'), 'w') as f:
         f.write('one' + '\n')
         f.write('\n') # must be removed
         f.write('one two three' + '\n')
         f.write('one two' + '\n')
         f.write('\n') # must be removed
         f.write('one two' + '\n')
     # changed order of args
     t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
     # changed order of args
     t.preprocess(os.path.join(random_basedir_name, "sample-corpus"),
         min_tokens=1, max_tokens=80, preprocess_external=False, mask=False, process_xml=False)
     # Compare by value: assertIs on integers only works by accident of
     # CPython's small-int caching.
     self.assertEqual(
         self.count_lines(os.path.join(random_basedir_name, 'corpus', 'train.en')),
         4, # four of the six bi-segments are non-empty on both sides
         "Bi-segments where src and/or trg are empty lines must be removed"
     )
     self.assertEqual(
         self.count_lines(os.path.join(random_basedir_name, 'corpus', 'train.fr')),
         4, # four of the six bi-segments are non-empty on both sides
         "Bi-segments where src and/or trg are empty lines must be removed"
     )
Beispiel #2
0
 def test_preprocess_max_tokens(self):
     """Check that preprocessing drops bi-segments exceeding ``max_tokens``.

     Writes a three-line parallel corpus whose sides have 1/3, 2/2 and 3/1
     tokens, runs ``preprocess`` with ``min_tokens=1`` and
     ``max_tokens=2``, and asserts that ``corpus/train.en`` and
     ``corpus/train.fr`` each contain exactly one line.
     """
     random_basedir_name = self.get_random_basename()
     os.mkdir(random_basedir_name)
     with open(os.path.join(random_basedir_name, 'sample-corpus.en'), 'w') as f:
         f.write('one' + '\n')
         f.write('one two' + '\n')
         f.write('one two three' + '\n')
     with open(os.path.join(random_basedir_name, 'sample-corpus.fr'), 'w') as f:
         f.write('one two three' + '\n')
         f.write('one two' + '\n')
         f.write('one' + '\n')
     # changed order of args
     t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
     # changed order of args
     t.preprocess(os.path.join(random_basedir_name, "sample-corpus"),
         min_tokens=1, max_tokens=2, preprocess_external=False, mask=False, process_xml=False)
     # Compare by value (assertIs on integers depends on interpreter
     # caching); the failure message names the limit actually under test.
     self.assertEqual(
         self.count_lines(os.path.join(random_basedir_name, 'corpus', 'train.en')),
         1, # only one line satisfies max_tokens for both en and fr
         "There must be no segment with more than max_tokens"
     )
     self.assertEqual(
         self.count_lines(os.path.join(random_basedir_name, 'corpus', 'train.fr')),
         1, # only one line satisfies max_tokens for both en and fr
         "There must be no segment with more than max_tokens"
     )
Beispiel #3
0
 def test_preprocess_external_eval_corpus(self):
     """Check that an external evaluation corpus is written to ``corpus/``.

     Generates a 200-bi-segment base corpus and a 50-bi-segment external
     evaluation corpus, runs ``preprocess``, and asserts that both sides
     of the evaluation corpus exist and contain the generated number of
     segments.
     """
     num_eval_bisegments = 50
     random_basedir_name = self.get_random_basename()
     os.mkdir(random_basedir_name)
     # changed order of args, made mask&xml explicit to fix keyword/positional args order
     t = TrainingMoses(
         random_basedir_name, "en", "fr", SELFCASING,
         tuning=None,
         evaluation=self._basedir_test_cases + os.sep + "external-sample-corpus",
         masking_strategy=None, xml_strategy=XML_PASS_THROUGH
     )
     # create sample base corpus
     self._create_random_parallel_corpus_files(
         path=random_basedir_name,
         filename_source="sample-corpus.en",
         filename_target="sample-corpus.fr",
         num_bisegments=200
     )
     # create sample external eval corpus
     self._create_random_parallel_corpus_files(
         path=self._basedir_test_cases,
         filename_source="external-sample-corpus.en",
         filename_target="external-sample-corpus.fr",
         num_bisegments=num_eval_bisegments
     )
     # changed order of args
     # NOTE(review): the positional True is presumably preprocess_external;
     # confirm against the signature of TrainingMoses.preprocess.
     t.preprocess(os.sep.join([random_basedir_name, "sample-corpus"]), 1, 80, True, False, False)
     eval_corpus_prefix = random_basedir_name + os.sep + "corpus" + os.sep + BASENAME_EVALUATION_CORPUS
     self.assertTrue(
         assertions.file_exists(eval_corpus_prefix + ".en"),
         "Source side of external evaluation corpus must be created"
     )
     self.assertTrue(
         assertions.file_exists(eval_corpus_prefix + ".fr"),
         "Target side of external evaluation corpus must be created"
     )
     # Compare against the generated size; a bare truthiness check would
     # accept any non-empty file.
     # NOTE(review): assumes preprocessing keeps every generated evaluation
     # segment - confirm.
     self.assertEqual(
         self.count_lines(eval_corpus_prefix + ".en"),
         num_eval_bisegments,
         "Number of segments in source side of external evaluation corpus must be correct"
     )
     self.assertEqual(
         self.count_lines(eval_corpus_prefix + ".fr"),
         num_eval_bisegments,
         "Number of segments in target side of external evaluation corpus must be correct"
     )
Beispiel #4
0
 def test_preprocess_base_corpus_correct_number_of_lines_train_only(self):
     """Check the training corpus size when no other corpora are requested.

     Generates a 200-bi-segment base corpus, runs ``preprocess`` with
     tuning and evaluation set to ``None``, and asserts that both sides of
     the training corpus in ``corpus/`` contain all 200 segments.
     """
     num_bisegments = 200
     random_basedir_name = self.get_random_basename()
     os.mkdir(random_basedir_name)
     # changed order of args
     t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
     self._create_random_parallel_corpus_files(
         path=random_basedir_name,
         filename_source="sample-corpus.en",
         filename_target="sample-corpus.fr",
         num_bisegments=num_bisegments
     )
     t.preprocess(os.path.join(random_basedir_name, "sample-corpus"), 1, 80, False, False, False)
     # assertEqual reports the actual count on failure, unlike
     # assertTrue(a == b).
     self.assertEqual(
         num_bisegments,
         self.count_lines(os.path.join(random_basedir_name, "corpus", BASENAME_TRAINING_CORPUS + ".en")),
         "Number of segments in source side of training corpus must be correct"
     )
     self.assertEqual(
         num_bisegments,
         self.count_lines(os.path.join(random_basedir_name, "corpus", BASENAME_TRAINING_CORPUS + ".fr")),
         "Number of segments in target side of training corpus must be correct"
     )
Beispiel #5
0
 def test_preprocess_base_corpus_file_creation_train_only(self):
     """Check that preprocessing creates both training corpus files.

     Generates a 200-bi-segment base corpus, runs ``preprocess`` with
     tuning and evaluation set to ``None``, and asserts that the source
     and target training files are listed in the ``corpus/`` directory.
     """
     random_basedir_name = self.get_random_basename()
     os.mkdir(random_basedir_name)
     # changed order of args
     t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
     self._create_random_parallel_corpus_files(
         path=random_basedir_name,
         filename_source="sample-corpus.en",
         filename_target="sample-corpus.fr",
         num_bisegments=200
     )
     t.preprocess(os.path.join(random_basedir_name, "sample-corpus"), 1, 80, False, False, False)
     files_created = os.listdir(os.path.join(random_basedir_name, "corpus"))
     # assertIn shows the directory listing on failure, which a bare
     # assertTrue(x in y) does not.
     self.assertIn(
         BASENAME_TRAINING_CORPUS + ".en",
         files_created,
         "Training corpus for source language must be created"
     )
     self.assertIn(
         BASENAME_TRAINING_CORPUS + ".fr",
         files_created,
         "Training corpus for target language must be created"
     )