Example #1
 def generator(self, data_dir, tmp_dir, train):
     character_vocab = text_encoder.ByteTextEncoder()
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate._compile_data(tmp_dir, datasets,
                                         "wmt_encs_chr_%s" % tag)
     return translate.character_generator(data_path + ".lang1",
                                          data_path + ".lang2",
                                          character_vocab, EOS)
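Here text_encoder.ByteTextEncoder() supplies a character-level (byte-level) vocabulary, so no vocab file has to be generated first. Below is a minimal, self-contained sketch of what this pipeline yields per sentence pair; ByteEncoder and character_pairs are simplified stand-ins for tensor2tensor's ByteTextEncoder and translate.character_generator, and the reserved ids assume t2t's defaults (PAD=0, EOS=1):

    EOS_ID = 1  # tensor2tensor reserves id 0 for PAD and id 1 for EOS

    class ByteEncoder(object):
        # Simplified stand-in for text_encoder.ByteTextEncoder: each UTF-8
        # byte becomes one token id, shifted past the two reserved ids.
        def encode(self, s):
            return [b + 2 for b in bytearray(s.encode("utf-8"))]

    def character_pairs(source_path, target_path, vocab, eos_id=EOS_ID):
        # Yield one {"inputs": ..., "targets": ...} dict per parallel line,
        # the same shape translate.character_generator produces.
        with open(source_path) as src_f, open(target_path) as tgt_f:
            for src_line, tgt_line in zip(src_f, tgt_f):
                yield {"inputs": vocab.encode(src_line.strip()) + [eos_id],
                       "targets": vocab.encode(tgt_line.strip()) + [eos_id]}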
Example #2
 def generator(self, data_dir, tmp_dir, train):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         _ENDE_TRAIN_DATASETS)
     datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
     tag = "train" if train else "dev"
     data_path = translate._compile_data(tmp_dir, datasets,
                                         "wmt_ende_tok_%s" % tag)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, EOS)
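Unlike the character-level variant, this generator needs a shared subword vocabulary. generator_utils.get_or_generate_vocab builds one of roughly targeted_vocab_size entries from the training datasets and caches it under vocab_file in data_dir, so later runs reuse the file. A toy sketch of that get-or-generate caching pattern, with whitespace tokens standing in for subwords (the names below are illustrative, not t2t APIs):

    import os

    def get_or_generate_toy_vocab(data_dir, vocab_filename, corpus_paths):
        # Reuse the cached vocab file if it exists; otherwise build it once.
        vocab_path = os.path.join(data_dir, vocab_filename)
        if not os.path.exists(vocab_path):
            tokens = set()
            for path in corpus_paths:
                with open(path) as f:
                    for line in f:
                        tokens.update(line.split())
            with open(vocab_path, "w") as f:
                f.write("\n".join(sorted(tokens)))
        with open(vocab_path) as f:
            return {tok: i for i, tok in enumerate(f.read().splitlines())}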
Example #3
 def generator(self, data_dir, tmp_dir, train):
   datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in datasets]
   target_datasets = [[item[0], [item[1][1]]] for item in datasets]
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       source_datasets + target_datasets)
   tag = "train" if train else "dev"
   data_path = translate._compile_data(tmp_dir, datasets,
                                       "setimes_mken_tok_%s" % tag)
   # We generate English->X data by convention; to train reverse translation,
   # add the "_rev" suffix to the problem name, e.g.:
   #   --problems=translate_enmk_setimes32k_rev
   return translate.token_generator(data_path + ".lang2", data_path + ".lang1",
                                    symbolizer_vocab, EOS)
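The two list comprehensions above rely on each dataset entry having the shape [archive_url, [source_filename, target_filename]]; they split one bilingual dataset list into a source-only and a target-only list so that both sides feed the shared vocab. A toy illustration (the URL and filenames are hypothetical):

    datasets = [["http://example.com/setimes.tgz",          # hypothetical
                 ["setimes.mk-en.mk", "setimes.mk-en.en"]]]
    source_datasets = [[item[0], [item[1][0]]] for item in datasets]
    target_datasets = [[item[0], [item[1][1]]] for item in datasets]
    # source_datasets == [["http://example.com/setimes.tgz", ["setimes.mk-en.mk"]]]
    # target_datasets == [["http://example.com/setimes.tgz", ["setimes.mk-en.en"]]]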
Example #4
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ZHEN_TRAIN_DATASETS if train else _ZHEN_TEST_DATASETS
     source_datasets = [[item[0], [item[1][0]]]
                        for item in _ZHEN_TRAIN_DATASETS]
     target_datasets = [[item[0], [item[1][1]]]
                        for item in _ZHEN_TRAIN_DATASETS]
     source_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.source_vocab_name,
         self.targeted_vocab_size, source_datasets)
     target_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.target_vocab_name,
         self.targeted_vocab_size, target_datasets)
     tag = "train" if train else "dev"
     data_path = translate._compile_data(tmp_dir, datasets,
                                         "wmt_zhen_tok_%s" % tag)
     # We generate English->X data by convention; to train reverse translation,
     # add the "_rev" suffix to the problem name, e.g.:
     #   --problems=translate_enzh_wmt8k_rev
     return translate.bi_vocabs_token_generator(data_path + ".lang2",
                                                data_path + ".lang1",
                                                source_vocab, target_vocab,
                                                EOS)
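This problem keeps two separate vocabularies, presumably because Chinese and English share almost no surface tokens, and the generator reads data_path + ".lang2" as inputs and ".lang1" as targets to implement the English->X convention noted in the comment. A minimal sketch in the spirit of translate.bi_vocabs_token_generator (the dict vocabs and whitespace split stand in for t2t's subword encoders):

    EOS_ID = 1

    def bi_vocab_pairs(source_path, target_path, source_vocab, target_vocab):
        # Encode each side with its own vocabulary and append EOS to both.
        with open(source_path) as src_f, open(target_path) as tgt_f:
            for src_line, tgt_line in zip(src_f, tgt_f):
                yield {
                    "inputs": [source_vocab[w] for w in src_line.split()] + [EOS_ID],
                    "targets": [target_vocab[w] for w in tgt_line.split()] + [EOS_ID],
                }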
Example #5
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     vocab_datasets = []
     data_path = translate._compile_data(tmp_dir, datasets,
                                         "wmt_encs_tok_%s" % tag)
     # CzEng contains 100 gzipped files with tab-separated columns, so we
     # expect it to be the first dataset in datasets and use the newly
     # created *.lang{1,2} files for vocab construction.
     if datasets[0][0].endswith("data-plaintext-format.tar"):
         vocab_datasets.append([
             datasets[0][0],
             ["wmt_encs_tok_%s.lang1" % tag,
              "wmt_encs_tok_%s.lang2" % tag]
         ])
         datasets = datasets[1:]
     vocab_datasets += [[item[0], [item[1][0], item[1][1]]]
                        for item in datasets]
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         vocab_datasets)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, EOS)
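The special case exists because the CzEng tarball is not a set of plain parallel text files, so the freshly compiled wmt_encs_tok_*.lang{1,2} outputs are substituted for it when assembling the vocab corpus, and the tarball entry is dropped from the normal per-dataset handling. A toy run of that branch (the URLs and filenames are hypothetical):

    tag = "train"
    datasets = [["http://example.com/data-plaintext-format.tar", []],
                ["http://example.com/news.tgz", ["news.cs", "news.en"]]]
    vocab_datasets = []
    if datasets[0][0].endswith("data-plaintext-format.tar"):
        vocab_datasets.append([datasets[0][0],
                               ["wmt_encs_tok_%s.lang1" % tag,
                                "wmt_encs_tok_%s.lang2" % tag]])
        datasets = datasets[1:]
    vocab_datasets += [[item[0], [item[1][0], item[1][1]]] for item in datasets]
    # vocab_datasets ==
    #   [["http://example.com/data-plaintext-format.tar",
    #     ["wmt_encs_tok_train.lang1", "wmt_encs_tok_train.lang2"]],
    #    ["http://example.com/news.tgz", ["news.cs", "news.en"]]]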