def test_preprocess_remove_empty_lines(self):
    """Bi-segments with an empty source and/or target line must be dropped.

    Writes a six-line parallel corpus in which two segment pairs contain an
    empty line on at least one side, runs preprocessing without an external
    corpus, masking or XML processing, and expects the four remaining pairs
    in both ``corpus/train.en`` and ``corpus/train.fr``.
    """
    random_basedir_name = self.get_random_basename()
    os.mkdir(random_basedir_name)

    source_lines = [
        'one',
        '',  # must be removed
        'one two three',
        'one two',
        'one',  # must be removed (because .fr is empty)
        'one two',
    ]
    target_lines = [
        'one',
        '',  # must be removed
        'one two three',
        'one two',
        '',  # must be removed
        'one two',
    ]
    with open(os.sep.join([random_basedir_name, 'sample-corpus.en']), 'w') as f:
        f.writelines(line + '\n' for line in source_lines)
    with open(os.sep.join([random_basedir_name, 'sample-corpus.fr']), 'w') as f:
        f.writelines(line + '\n' for line in target_lines)

    t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
    t.preprocess(
        os.sep.join([random_basedir_name, "sample-corpus"]),
        min_tokens=1,
        max_tokens=80,
        preprocess_external=False,
        mask=False,
        process_xml=False
    )

    # Six pairs were written and two of them contain an empty line.
    expected_lines = 4
    for language in ("en", "fr"):
        # assertEqual, not assertIs: line counts are compared by value, and
        # integer identity is an interpreter implementation detail.
        self.assertEqual(
            self.count_lines(os.sep.join([random_basedir_name, 'corpus', 'train.' + language])),
            expected_lines,
            "Bi-segments where src and/or trg are empty lines must be removed"
        )
def test_preprocess_max_tokens(self):
    """Bi-segments exceeding ``max_tokens`` on either side must be dropped.

    The source side has 1, 2 and 3 tokens per line and the target side has
    3, 2 and 1. With ``max_tokens=2`` only the middle pair is within the
    limit on both sides, so exactly one line is expected in both
    ``corpus/train.en`` and ``corpus/train.fr``.
    """
    random_basedir_name = self.get_random_basename()
    os.mkdir(random_basedir_name)

    source_lines = ['one', 'one two', 'one two three']
    target_lines = ['one two three', 'one two', 'one']
    with open(os.sep.join([random_basedir_name, 'sample-corpus.en']), 'w') as f:
        f.writelines(line + '\n' for line in source_lines)
    with open(os.sep.join([random_basedir_name, 'sample-corpus.fr']), 'w') as f:
        f.writelines(line + '\n' for line in target_lines)

    t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
    t.preprocess(
        os.sep.join([random_basedir_name, "sample-corpus"]),
        min_tokens=1,
        max_tokens=2,
        preprocess_external=False,
        mask=False,
        process_xml=False
    )

    # Only one pair satisfies max_tokens for both en and fr.
    expected_lines = 1
    for language in ("en", "fr"):
        # assertEqual, not assertIs: line counts are compared by value, and
        # integer identity is an interpreter implementation detail.
        self.assertEqual(
            self.count_lines(os.sep.join([random_basedir_name, 'corpus', 'train.' + language])),
            expected_lines,
            "There must be no segment with more than max_tokens"
        )
def test_preprocess_external_eval_corpus(self):
    """An external evaluation corpus must be preprocessed into ``corpus/``.

    Creates a 200-pair base corpus and a 50-pair external evaluation corpus,
    runs preprocessing with ``preprocess_external`` enabled, and checks that
    both sides of the evaluation corpus exist and hold the expected number
    of segments.
    """
    random_basedir_name = self.get_random_basename()
    os.mkdir(random_basedir_name)

    num_external_bisegments = 50

    t = TrainingMoses(
        random_basedir_name, "en", "fr", SELFCASING,
        tuning=None,
        evaluation=self._basedir_test_cases + os.sep + "external-sample-corpus",
        masking_strategy=None,
        xml_strategy=XML_PASS_THROUGH
    )
    # create sample base corpus
    self._create_random_parallel_corpus_files(
        path=random_basedir_name,
        filename_source="sample-corpus.en",
        filename_target="sample-corpus.fr",
        num_bisegments=200
    )
    # create sample external eval corpus
    self._create_random_parallel_corpus_files(
        path=self._basedir_test_cases,
        filename_source="external-sample-corpus.en",
        filename_target="external-sample-corpus.fr",
        num_bisegments=num_external_bisegments
    )
    t.preprocess(os.sep.join([random_basedir_name, "sample-corpus"]), 1, 80, True, False, False)

    eval_source = os.sep.join([random_basedir_name, "corpus", BASENAME_EVALUATION_CORPUS + ".en"])
    eval_target = os.sep.join([random_basedir_name, "corpus", BASENAME_EVALUATION_CORPUS + ".fr"])

    self.assertTrue(
        assertions.file_exists(eval_source),
        "Source side of external evaluation corpus must be created"
    )
    self.assertTrue(
        assertions.file_exists(eval_target),
        "Target side of external evaluation corpus must be created"
    )
    # Compare against the number of segments written above; a bare
    # truthiness check would accept any non-empty file.
    # NOTE(review): assumes preprocessing keeps every external segment -
    # confirm against TrainingMoses.preprocess.
    self.assertEqual(
        self.count_lines(eval_source),
        num_external_bisegments,
        "Number of segments in source side of external evaluation corpus must be correct"
    )
    self.assertEqual(
        self.count_lines(eval_target),
        num_external_bisegments,
        "Number of segments in target side of external evaluation corpus must be correct"
    )
def test_preprocess_base_corpus_correct_number_of_lines_train_only(self):
    """Without tuning/evaluation corpora, all segments end up in training.

    Creates a 200-pair base corpus, preprocesses it with no external
    corpora, and expects 200 lines on both sides of the training corpus.
    """
    random_basedir_name = self.get_random_basename()
    os.mkdir(random_basedir_name)

    num_bisegments = 200

    t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
    self._create_random_parallel_corpus_files(
        path=random_basedir_name,
        filename_source="sample-corpus.en",
        filename_target="sample-corpus.fr",
        num_bisegments=num_bisegments
    )
    t.preprocess(os.sep.join([random_basedir_name, "sample-corpus"]), 1, 80, False, False, False)

    # assertEqual reports the actual line count when the check fails.
    self.assertEqual(
        self.count_lines(os.sep.join([random_basedir_name, "corpus", BASENAME_TRAINING_CORPUS + ".en"])),
        num_bisegments,
        "Number of segments in source side of training corpus must be correct"
    )
    self.assertEqual(
        self.count_lines(os.sep.join([random_basedir_name, "corpus", BASENAME_TRAINING_CORPUS + ".fr"])),
        num_bisegments,
        "Number of segments in target side of training corpus must be correct"
    )
def test_preprocess_base_corpus_file_creation_train_only(self):
    """Preprocessing must create both sides of the training corpus.

    Creates a 200-pair base corpus, preprocesses it with no external
    corpora, and checks that the source and target training files are
    listed in the ``corpus`` directory.
    """
    random_basedir_name = self.get_random_basename()
    os.mkdir(random_basedir_name)

    t = TrainingMoses(random_basedir_name, "en", "fr", SELFCASING, None, None, None, XML_PASS_THROUGH)
    self._create_random_parallel_corpus_files(
        path=random_basedir_name,
        filename_source="sample-corpus.en",
        filename_target="sample-corpus.fr",
        num_bisegments=200
    )
    t.preprocess(os.sep.join([random_basedir_name, "sample-corpus"]), 1, 80, False, False, False)

    files_created = os.listdir(os.sep.join([random_basedir_name, "corpus"]))
    # assertIn reports the directory listing when the file is missing.
    self.assertIn(
        BASENAME_TRAINING_CORPUS + ".en",
        files_created,
        "Training corpus for source language must be created"
    )
    self.assertIn(
        BASENAME_TRAINING_CORPUS + ".fr",
        files_created,
        "Training corpus for target language must be created"
    )