def setUp(self):
    self.src_txt, self.trg_txt = test_utils.create_test_text_files()
    self.vocab_file_path = test_utils.make_temp_file()
    self.d = dictionary.Dictionary.build_vocab_file(
        corpus_files=[self.src_txt, self.trg_txt],
        vocab_file=self.vocab_file_path,
        max_vocab_size=0,
        padding_factor=1,  # don't add extra padding symbols
    )
    # src_ref is reversed
    self.src_ref = [
        [106, 104, 102, 100],
        [104, 104, 102, 102, 100, 100],
        [102, 102, 102, 102, 100, 100, 100, 100],
        [100, 100, 100, 100, 100, 100, 100, 100, 100, 100],
    ]
    self.trg_ref = [
        [101, 101, 101, 101, 101, 101, 101, 101, 101, 101],
        [101, 101, 101, 101, 103, 103, 103, 103],
        [101, 101, 103, 103, 105, 105],
        [101, 103, 105, 107],
    ]
    (
        self.src_txt_numberized,
        self.trg_txt_numberized,
    ) = test_utils.create_test_numberized_data_files(
        self.src_ref, self.trg_ref, reverse_source=True
    )
    self.num_sentences = 4
def test_load_data_single_path(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    src_text_file, tgt_text_file = test_utils.create_test_text_files()
    src_bin_path = preprocess.binarize_text_file(
        text_file=src_text_file,
        dictionary=src_dict,
        output_path=tempfile.NamedTemporaryFile().name,
        append_eos=True,
        reverse_order=False,
    )
    tgt_bin_path = preprocess.binarize_text_file(
        text_file=tgt_text_file,
        dictionary=tgt_dict,
        output_path=tempfile.NamedTemporaryFile().name,
        append_eos=True,
        reverse_order=False,
    )
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    split = "0"
    task.load_dataset(split, src_bin_path, tgt_bin_path)
    self.assertEqual(len(task.datasets[split]), 4)
    self.assertIsInstance(task.datasets[split], LanguagePairDataset)
def test_load_data_multi_path(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    num_paths = 4
    src_bin_path, tgt_bin_path = {}, {}
    for i in range(num_paths):
        src_text_file, tgt_text_file = test_utils.create_test_text_files()
        src_bin_path[i] = preprocess.binarize_text_file(
            text_file=src_text_file,
            dictionary=src_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
        tgt_bin_path[i] = preprocess.binarize_text_file(
            text_file=tgt_text_file,
            dictionary=tgt_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    split = "1"
    task.load_dataset(split, src_bin_path, tgt_bin_path)
    # 4 corpora of 4 sentences each are combined into one sampled dataset.
    self.assertEqual(len(task.datasets[split]), 16)
    self.assertIsInstance(task.datasets[split], MultiCorpusSampledDataset)
def _prepare_data_multi_path(self, num_paths):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    test_args.dataset_upsampling = None
    test_args.dataset_relative_ratio = None
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    src_bin_path, tgt_bin_path = {}, {}
    for i in range(num_paths):
        src_text_file, tgt_text_file = test_utils.create_test_text_files()
        src_bin_path[i] = preprocess.binarize_text_file(
            text_file=src_text_file,
            dictionary=src_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
        tgt_bin_path[i] = preprocess.binarize_text_file(
            text_file=tgt_text_file,
            dictionary=tgt_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
    return test_args, src_dict, tgt_dict, src_bin_path, tgt_bin_path
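# Hypothetical usage sketch for the helper above (not one of the original
# tests): the real callers presumably override test_args.dataset_upsampling
# or test_args.dataset_relative_ratio before loading, which is why the helper
# initializes both to None. Method name and split name are illustrative only.
def test_prepare_data_multi_path_sketch(self):
    (
        test_args,
        src_dict,
        tgt_dict,
        src_bin_path,
        tgt_bin_path,
    ) = self._prepare_data_multi_path(num_paths=2)
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    task.load_dataset("valid", src_bin_path, tgt_bin_path)
    self.assertIsInstance(task.datasets["valid"], MultiCorpusSampledDataset)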
def setUp(self):
    self.src_txt, self.trg_txt = test_utils.create_test_text_files()
    self.vocab_file_path = test_utils.make_temp_file()
    self.d = dictionary.Dictionary.build_vocab_file(
        corpus_files=[self.src_txt, self.trg_txt],
        vocab_file=self.vocab_file_path,
        max_vocab_size=0,
    )
    # src_ref is reversed; ids are offset by +1 to match Lua's 1-based indexing
    self.src_ref = [
        [107, 105, 103, 101],
        [105, 105, 103, 103, 101, 101],
        [103, 103, 103, 103, 101, 101, 101, 101],
        [101, 101, 101, 101, 101, 101, 101, 101, 101, 101],
    ]
    self.trg_ref = [
        [102, 102, 102, 102, 102, 102, 102, 102, 102, 102],
        [102, 102, 102, 102, 104, 104, 104, 104],
        [102, 102, 104, 104, 106, 106],
        [102, 104, 106, 108],
    ]
    (
        self.src_txt_numberized,
        self.trg_txt_numberized,
    ) = test_utils.create_test_numberized_data_files(
        self.src_ref, self.trg_ref, reverse_source=True
    )
    self.lua_eos = self.d.eos_index + 1
    self.num_sentences = 4
def test_build_vocab_file_max_vocab(self):
    src_txt, trg_txt = test_utils.create_test_text_files()
    tmp_prefix = test_utils.make_temp_file()
    src_dict1 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src1", max_vocab_size=1
    )
    src_dict2 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src2", max_vocab_size=2
    )
    src_dict3 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src3", max_vocab_size=104
    )
    src_dict4 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src4", max_vocab_size=0
    )
    self.assertEqual(src_dict1.nspecial + 1, len(src_dict1))
    self.assertEqual(src_dict2.nspecial + 2, len(src_dict2))
    self.assertEqual(src_dict3.nspecial + 4, len(src_dict3))
    # A max_vocab_size larger than the corpus vocab is equivalent to no limit (0).
    self._assert_vocab_equal(src_dict3, src_dict4)
    os.remove(f"{tmp_prefix}.src1")
    os.remove(f"{tmp_prefix}.src2")
    os.remove(f"{tmp_prefix}.src3")
    os.remove(f"{tmp_prefix}.src4")
    os.remove(src_txt)
    os.remove(trg_txt)
def test_build_vocab_file(self):
    src_txt, trg_txt = test_utils.create_test_text_files()
    tmp_prefix = test_utils.make_temp_file()
    src_dict1 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src1", max_vocab_size=1000
    )
    src_dict2 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt, src_txt, src_txt],
        vocab_file=f"{tmp_prefix}.src2",
        max_vocab_size=1000,
        padding_factor=1,
    )
    trg_dict1 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[trg_txt], vocab_file=f"{tmp_prefix}.trg1", max_vocab_size=1000
    )
    trg_dict2 = dictionary.Dictionary.build_vocab_file(
        corpus_files=[trg_txt, trg_txt, trg_txt],
        vocab_file=f"{tmp_prefix}.trg2",
        max_vocab_size=1000,
        padding_factor=1,
    )
    srctrg_dict = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt, trg_txt],
        vocab_file=f"{tmp_prefix}.srctrg",
        max_vocab_size=1000,
        padding_factor=1,
    )
    nspecial = src_dict1.nspecial
    self.assertEqual(len(src_dict1), nspecial + 4)
    self.assertEqual(len(trg_dict1), nspecial + 4)
    self.assertEqual(len(srctrg_dict), nspecial + 8)
    for s in src_dict1.symbols:
        self.assertIn(s, srctrg_dict.symbols)
    for s in trg_dict1.symbols:
        self.assertIn(s, srctrg_dict.symbols)

    src_dict1_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.src1")
    src_dict2_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.src2")
    trg_dict1_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.trg1")
    trg_dict2_loaded = dictionary.Dictionary.load(f"{tmp_prefix}.trg2")
    self._assert_vocab_equal(src_dict1, src_dict2)
    self._assert_vocab_equal(src_dict1, src_dict1_loaded)
    self._assert_vocab_equal(src_dict1, src_dict2_loaded)
    self._assert_vocab_equal(trg_dict1, trg_dict2)
    self._assert_vocab_equal(trg_dict1, trg_dict1_loaded)
    self._assert_vocab_equal(trg_dict1, trg_dict2_loaded)
    for c in range(nspecial, nspecial + 4):
        self.assertEqual(src_dict1.count[c], src_dict1_loaded.count[c])
        self.assertEqual(src_dict2.count[c], src_dict2_loaded.count[c])
        self.assertEqual(src_dict1.count[c] * 3, src_dict2.count[c])
        self.assertEqual(trg_dict1.count[c], trg_dict1_loaded.count[c])
        self.assertEqual(trg_dict2.count[c], trg_dict2_loaded.count[c])
        self.assertEqual(trg_dict1.count[c] * 3, trg_dict2.count[c])
    os.remove(f"{tmp_prefix}.src1")
    os.remove(f"{tmp_prefix}.src2")
    os.remove(f"{tmp_prefix}.trg1")
    os.remove(f"{tmp_prefix}.trg2")
    os.remove(src_txt)
    os.remove(trg_txt)
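# The two tests above call an _assert_vocab_equal helper that is not shown in
# this excerpt. A minimal sketch, assuming a fairseq-style Dictionary that
# exposes `symbols` (id -> token list) and `indices` (token -> id map); the
# original helper may compare additional fields.
def _assert_vocab_equal(self, d1, d2):
    self.assertEqual(d1.symbols, d2.symbols)
    self.assertEqual(d1.indices, d2.indices)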
def test_load_data_single_path_idx_bin(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    src_text_file, tgt_text_file = test_utils.create_test_text_files()
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    with tempfile.TemporaryDirectory() as destdir:
        preprocess_args = [
            "--source-lang",
            test_args.source_lang,
            "--target-lang",
            test_args.target_lang,
            "--destdir",
            destdir,
        ]
        preproc_parser = preprocess_options.get_preprocessing_parser()
        preproc_args = preproc_parser.parse_args(preprocess_args)
        # Binarize to fairseq's mmap idx/bin format rather than npz.
        preproc_args.dataset_impl = "mmap"
        split = "train"
        binarize(
            preproc_args,
            src_text_file,
            src_dict,
            split,
            test_args.source_lang,
            offset=0,
            end=-1,
        )
        binarize(
            preproc_args,
            tgt_text_file,
            tgt_dict,
            split,
            test_args.target_lang,
            offset=0,
            end=-1,
        )
        src_path = dataset_dest_prefix(preproc_args, split, test_args.source_lang)
        tgt_path = dataset_dest_prefix(preproc_args, split, test_args.target_lang)
        task.load_dataset(split, src_path, tgt_path, is_npz=False)
        self.assertEqual(len(task.datasets[split]), 4)
        self.assertIsInstance(task.datasets[split], LanguagePairDataset)
def test_load_data_noising(self):
    test_args = test_utils.ModelParamsDict()
    test_args.source_lang = "en"
    test_args.target_lang = "fr"
    test_args.log_verbose = False
    src_dict, tgt_dict = test_utils.create_vocab_dictionaries()
    num_paths = 4
    src_bin_path, tgt_bin_path = {}, {}
    for i in range(num_paths):
        src_text_file, tgt_text_file = test_utils.create_test_text_files()
        src_bin_path[i] = preprocess.binarize_text_file(
            text_file=src_text_file,
            dictionary=src_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
        tgt_bin_path[i] = preprocess.binarize_text_file(
            text_file=tgt_text_file,
            dictionary=tgt_dict,
            output_path=tempfile.NamedTemporaryFile().name,
            append_eos=True,
            reverse_order=False,
        )
    task = tasks.PytorchTranslateTask(test_args, src_dict, tgt_dict)
    split = "1"
    task.load_dataset(
        split,
        src_bin_path,
        tgt_bin_path,
        noiser={
            0: UnsupervisedMTNoising(
                dictionary=src_dict,
                max_word_shuffle_distance=3,
                word_dropout_prob=0.2,
                word_blanking_prob=0.2,
            )
        },
    )
    self.assertEqual(len(task.datasets[split]), 16)
    # Only corpus 0 was given a noiser, so only its source side is wrapped.
    self.assertIsInstance(task.datasets[split].datasets[0].src, NoisingDataset)
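# The noiser above is only exercised indirectly through load_dataset. As a
# hedged sketch (not one of the original tests), UnsupervisedMTNoising can
# also be applied to a toy batch directly. This assumes fairseq's convention
# that noising() takes a (length x batch) LongTensor of token ids plus a
# lengths tensor and returns the noised tokens; the exact API may differ
# across fairseq versions, and the token ids here are illustrative.
def test_noiser_direct_call_sketch(self):
    import torch  # assumed available, since fairseq depends on it

    src_dict, _ = test_utils.create_vocab_dictionaries()
    noiser = UnsupervisedMTNoising(
        dictionary=src_dict,
        max_word_shuffle_distance=3,
        word_dropout_prob=0.2,
        word_blanking_prob=0.2,
    )
    # One sentence of three tokens followed by EOS, shaped T x B.
    tokens = torch.LongTensor([[4], [5], [6], [src_dict.eos()]])
    noised = noiser.noising(x=tokens, lengths=torch.LongTensor([4]))
    self.assertEqual(noised.dim(), 2)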
def setUp(self):
    self.src_txt, self.trg_txt = test_utils.create_test_text_files()
    self.vocab_file_path = test_utils.make_temp_file()
    self.word_dict = Dictionary.build_vocab_file(
        corpus_files=[self.src_txt, self.trg_txt],
        vocab_file=self.vocab_file_path,
        max_vocab_size=0,
        padding_factor=1,  # don't add extra padding symbols
    )
    self.char_dict = Dictionary.build_vocab_file(
        corpus_files=[self.src_txt, self.trg_txt],
        vocab_file=self.vocab_file_path,
        max_vocab_size=0,
        is_char_vocab=True,
        padding_factor=1,  # don't add extra padding symbols
    )
    self.sample = self._dummy_char_data_sample(
        src_dict=self.word_dict,
        dst_dict=self.word_dict,
        src_char_dict=self.char_dict,
        dst_char_dict=self.char_dict,
    )
def test_push(self):
    max_vocab_dict = dictionary.MaxVocabDictionary()
    src_txt, trg_txt = test_utils.create_test_text_files()
    tmp_prefix = test_utils.make_temp_file()
    src_dict = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt], vocab_file=f"{tmp_prefix}.src", max_vocab_size=1000
    )
    srctrg_dict = dictionary.Dictionary.build_vocab_file(
        corpus_files=[src_txt, trg_txt],
        vocab_file=f"{tmp_prefix}.srctrg",
        max_vocab_size=1000,
    )
    self.assertEqual(len(max_vocab_dict), max_vocab_dict.nspecial)
    max_vocab_dict.push(src_dict)
    self.assertEqual(len(max_vocab_dict), len(src_dict))
    max_vocab_dict.push(srctrg_dict)
    self.assertEqual(len(max_vocab_dict), len(srctrg_dict))
    # Pushing a smaller dict must not shrink the current vocab.
    max_vocab_dict.push(src_dict)
    self.assertEqual(len(max_vocab_dict), len(srctrg_dict))
    os.remove(f"{tmp_prefix}.src")
    os.remove(f"{tmp_prefix}.srctrg")
    os.remove(src_txt)
    os.remove(trg_txt)
def setUp(self):
    (
        self.source_text_file,
        self.target_text_file,
    ) = test_utils.create_test_text_files()