Example #1
def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       _ENDE_TRAIN_DATASETS)
   datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_ende_tok_%s" % tag)
   return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                    symbolizer_vocab, EOS)
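The generators in these examples all follow the same pattern: build or load a vocabulary, compile the raw corpora into aligned .lang1/.lang2 files, and yield token-id dictionaries. A minimal usage sketch (the class name and paths are assumptions, not part of the snippet):

problem = TranslateEndeWmt32k()  # hypothetical Problem subclass
gen = problem.generator("/tmp/t2t_data", "/tmp/t2t_tmp", train=True)
for i, sample in enumerate(gen):
    # Each sample is a dict of token-id lists ending in EOS, e.g.
    # {"inputs": [213, 7, 1], "targets": [98, 4, 1]}.
    print(sample["inputs"], sample["targets"])
    if i >= 2:
        break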
Example #2
 def generator(self, data_dir, tmp_dir, train):
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         _TRAIN_DATASETS["de-es"])
     datasets = _TRAIN_DATASETS["de-es"] if train else _TEST_DATASETS[
         "de-es"]
     tag = "train" if train else "dev"
     data_path = translate.compile_data(tmp_dir, datasets,
                                        "legal_dees_tok_%s" % tag)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, EOS)
Example #3
 def generator(self, data_dir, tmp_dir, train):
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       _ENFR_TRAIN_SMALL_DATA)
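   # Note: the vocab above is always built from _ENFR_TRAIN_SMALL_DATA,
   # even when use_small_dataset is False.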
   if self.use_small_dataset:
     datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
   else:
     datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
   tag = "train" if train else "dev"
   data_path = translate.compile_data(tmp_dir, datasets,
                                      "wmt_enfr_tok_%s" % tag)
   return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                    symbolizer_vocab, EOS)
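The use_small_dataset flag is not defined in the snippet; a minimal sketch of how a problem subclass might expose it (class names are hypothetical):

class TranslateEnfrWmt32k(translate.TranslateProblem):
    @property
    def use_small_dataset(self):
        return False

class TranslateEnfrWmtSmall32k(TranslateEnfrWmt32k):
    @property
    def use_small_dataset(self):
        return True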
Example #4
 def generator(self, data_dir, tmp_dir, train):
     symbolizer_vocab = generator_utils.get_or_generate_txt_vocab(
         data_dir,
         self.vocab_file,
         self.targeted_vocab_size,
         filepatterns=[ENDE_TRAIN_TOK_SRC, ENDE_TRAIN_TOK_TRG])
     if train:
         data_src = ENDE_TRAIN_TOK_SRC
         data_trg = ENDE_TRAIN_TOK_TRG
     else:
         data_src = ENDE_DEV_TOK_SRC
         data_trg = ENDE_DEV_TOK_TRG
     return token_generator(data_src, data_trg, symbolizer_vocab, EOS)
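The ENDE_*_TOK_* constants are not shown; hypothetical definitions pointing at pre-tokenized corpora on disk, for illustration only:

ENDE_TRAIN_TOK_SRC = "/tmp/t2t_tmp/train.tok.en"
ENDE_TRAIN_TOK_TRG = "/tmp/t2t_tmp/train.tok.de"
ENDE_DEV_TOK_SRC = "/tmp/t2t_tmp/newstest2013.tok.en"
ENDE_DEV_TOK_TRG = "/tmp/t2t_tmp/newstest2013.tok.de"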
Example #5
 def generator(self, data_dir, tmp_dir, train):
   datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
   source_datasets = [[item[0], [item[1][0]]] for item in datasets]
   target_datasets = [[item[0], [item[1][1]]] for item in datasets]
   symbolizer_vocab = generator_utils.get_or_generate_vocab(
       data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
       source_datasets + target_datasets)
   tag = "train" if train else "dev"
   data_path = translate._compile_data(tmp_dir, datasets,
                                       "setimes_mken_tok_%s" % tag)
   # We generate English->X data by convention, to train reverse translation
   # just add the "_rev" suffix to the problem name, e.g., like this.
   #   --problems=translate_enmk_setimes32k_rev
   return translate.token_generator(data_path + ".lang2", data_path + ".lang1",
                                    symbolizer_vocab, EOS)
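The source/target list comprehensions above assume each dataset entry has the shape [archive_url, (source_filename, target_filename)]; a hypothetical entry for illustration (URL and filenames assumed):

_MKEN_TRAIN_DATASETS = [[
    "http://opus.example.org/SETIMES2/mk-en.txt.zip",
    ("SETIMES2.mk-en.mk", "SETIMES2.mk-en.en"),
]]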
Example #6
 def generator(self, data_dir, tmp_dir, train):
     """Instance of token generator for the WMT en->de task, training set."""
     dataset_path = ("train.tok.clean.bpe.32000"
                     if train else "newstest2013.tok.bpe.32000")
     train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path)
     token_tmp_path = os.path.join(tmp_dir, self.vocab_file)
     token_path = os.path.join(data_dir, self.vocab_file)
     tf.gfile.Copy(token_tmp_path, token_path, overwrite=True)
     with tf.gfile.GFile(token_path, mode="a") as f:
         f.write("UNK\n")  # Add UNK to the vocab.
     token_vocab = text_encoder.TokenTextEncoder(token_path,
                                                 replace_oov="UNK")
     return translate.token_generator(train_path + ".en",
                                      train_path + ".de", token_vocab, EOS)
Example #7
 def generator(self, data_dir, tmp_dir, train):
   """Instance of token generator for the WMT en->de task, training set."""
   dataset_path = ("train.tok.clean.bpe.32000"
                   if train else "newstest2013.tok.bpe.32000")
   train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path)
   token_tmp_path = os.path.join(tmp_dir, self.vocab_file)
   token_path = os.path.join(data_dir, self.vocab_file)
   tf.gfile.Copy(token_tmp_path, token_path, overwrite=True)
   with tf.gfile.GFile(token_path, mode="r") as f:
     vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n"
   with tf.gfile.GFile(token_path, mode="w") as f:
     f.write(vocab_data)
   token_vocab = text_encoder.TokenTextEncoder(token_path, replace_oov="UNK")
   return translate.token_generator(train_path + ".en", train_path + ".de",
                                    token_vocab, EOS)
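Example #6 appends an UNK entry so replace_oov has a target; Example #7 additionally prepends <pad> and <EOS> so they occupy the reserved ids 0 and 1 that text_encoder expects. A minimal sketch of the resulting encoder behavior (vocab file name assumed):

from tensor2tensor.data_generators import text_encoder

enc = text_encoder.TokenTextEncoder("vocab.bpe.32000", replace_oov="UNK")
ids = enc.encode("ein Beispiel")  # out-of-vocab tokens map to UNK's id
print(enc.decode(ids))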
Example #8
    def generator(self, data_dir, tmp_dir, train):
        tf.gfile.MakeDirs(data_dir)

        vocab_filepath_old = os.path.join(tmp_dir, self.shared_vocab_name)
        vocab_filepath_new = os.path.join(data_dir, self.shared_vocab_name)
        tf.gfile.Copy(vocab_filepath_old, vocab_filepath_new, overwrite=True)
        shared_vocab = text_encoder.TokenTextEncoder(vocab_filepath_new,
                                                     replace_oov='<unk>')

        tag = "trn" if train else "dev"

        source_path = os.path.join(tmp_dir, "src.%s" % tag)
        target_path = os.path.join(tmp_dir, "tgt.%s" % tag)
        return translate.token_generator(source_path, target_path,
                                         shared_vocab, text_encoder.EOS_ID)
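This variant reads pre-split corpora directly from tmp_dir; the file layout implied by the code (shared_vocab_name is a property of the problem class, not shown):

# <tmp_dir>/<shared_vocab_name>  -- one token per line
# <tmp_dir>/src.trn, tgt.trn     -- training source/target, one sentence per line
# <tmp_dir>/src.dev, tgt.dev     -- dev source/target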
Example #9
 def generator(self, data_dir, tmp_dir, train):
     datasets = _TRAIN_DATASETS if train else _TEST_DATASETS
     source_datasets = [[FLAGS.raw_data_dir, [item[0]]]
                        for item in datasets]
     target_datasets = [[FLAGS.raw_data_dir, [item[1]]]
                        for item in datasets]
     # Copy vocab to data directory
     vocab_path = os.path.join(data_dir, self.vocab_name)
     if os.path.exists(vocab_path):
         os.remove(vocab_path)
     copyVocab(os.path.join(FLAGS.raw_data_dir, _VOCABS[2]), vocab_path)
     token_vocab = text_encoder.TokenTextEncoder(vocab_path,
                                                 replace_oov="<unk>")
     tag = "train" if train else "dev"
     data_path = _compile_data(tmp_dir, datasets, "generic_tok_%s" % tag)
     return translate.token_generator(data_path + ".src",
                                      data_path + ".trg", token_vocab, EOS)
Example #10
    def generator(self, data_dir, tmp_dir, train):
        print("we are in the subwords generator...")
        vocab_path = os.path.join(tmp_dir, self.vocab_name)
        print(vocab_path)

        vocab_data_path = os.path.join(data_dir, self.vocab_name)

        corpus_src = os.path.join(tmp_dir,
                                  self.corpus_lang(train, self.SRC_LANG))
        corpus_tgt = os.path.join(tmp_dir,
                                  self.corpus_lang(train, self.TGT_LANG))

        symbolizer_vocab = self.generate_vocab(
            data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, [
                self.corpus_lang(train, self.SRC_LANG),
                self.corpus_lang(train, self.TGT_LANG)
            ])
        return translate.token_generator(corpus_src, corpus_tgt,
                                         symbolizer_vocab, EOS)
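corpus_lang and generate_vocab are not shown; a hypothetical corpus_lang that maps (train, lang) to a tokenized corpus filename, for illustration:

def corpus_lang(self, train, lang):
    tag = "train" if train else "dev"
    return "corpus.%s.%s" % (tag, lang)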
Example #11
    def generator(self, data_dir, tmp_dir, train):
        """Instance of token generator for the mn->zh task, training set."""
        dataset_path = ("train.32k" if train else "valid.32k")
        train_path = os.path.join(data_dir, dataset_path)

        source_token_path = os.path.join(data_dir, self.source_vocab_name())
        target_token_path = os.path.join(data_dir, self.target_vocab_name())
        for token_path in [source_token_path, target_token_path]:
            with tf.gfile.GFile(token_path, mode="r") as f:
                vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n"
            with tf.gfile.GFile(token_path, mode="w") as f:
                f.write(vocab_data)
        source_token_vocab = text_encoder.TokenTextEncoder(source_token_path,
                                                           replace_oov="UNK")
        target_token_vocab = text_encoder.TokenTextEncoder(target_token_path,
                                                           replace_oov="UNK")
        # token_generator takes a single shared vocab; the two-vocab
        # variant is needed when source and target use separate vocabs.
        return translate.bi_vocabs_token_generator(train_path + ".mn",
                                                   train_path + ".zh",
                                                   source_token_vocab,
                                                   target_token_vocab, EOS)
Example #12
 def generator(self, data_dir, tmp_dir, train):
     """Instance of token generator for the WMT en->de task."""
     # corpus = self.corpus(train)
     # corpus_path = os.path.join(tmp_dir, corpus)
     # print(corpus_path)
     vocab_path = os.path.join(tmp_dir, self.vocab_name)
     print(vocab_path)
     vocab_data_path = os.path.join(data_dir, self.vocab_name)
     corp_path_tgt = os.path.join(tmp_dir,
                                  self.corpus_lang(train, self.TGT_LANG))
     corp_path_src = os.path.join(tmp_dir,
                                  self.corpus_lang(train, self.SRC_LANG))
     if train:
         # create vocab here
         self._create_vocab([corp_path_tgt, corp_path_src], vocab_path)
         tf.gfile.Copy(vocab_path, vocab_data_path, overwrite=True)
         self._add_unk_to_vocab(vocab_data_path)  # Add UNK to the vocab.
     token_vocab = text_encoder.TokenTextEncoder(vocab_data_path,
                                                 replace_oov=UNK)
     # fixed
     return translate.token_generator(corp_path_src, corp_path_tgt,
                                      token_vocab, EOS)
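_create_vocab and _add_unk_to_vocab are not shown; a minimal sketch of the latter, assuming it simply appends the UNK token to the copied vocab file:

def _add_unk_to_vocab(self, vocab_path):
    with tf.gfile.GFile(vocab_path, mode="a") as f:
        f.write("UNK\n")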
Example #13
 def generator(self, data_dir, tmp_dir, train):
     datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
     tag = "train" if train else "dev"
     vocab_datasets = []
     data_path = translate._compile_data(tmp_dir, datasets,
                                         "wmt_encs_tok_%s" % tag)
     # CzEng contains 100 gz files with tab-separated columns, so let's expect
     # it is the first dataset in datasets and use the newly created *.lang{1,2}
     # files for vocab construction.
     if datasets[0][0].endswith("data-plaintext-format.tar"):
         vocab_datasets.append([
             datasets[0][0],
             ["wmt_encs_tok_%s.lang1" % tag,
              "wmt_encs_tok_%s.lang2" % tag]
         ])
         datasets = datasets[1:]
     vocab_datasets += [[item[0], [item[1][0], item[1][1]]]
                        for item in datasets]
     symbolizer_vocab = generator_utils.get_or_generate_vocab(
         data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
         vocab_datasets)
     return translate.token_generator(data_path + ".lang1",
                                      data_path + ".lang2",
                                      symbolizer_vocab, EOS)
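The final list comprehension assumes non-CzEng entries shaped like [archive_url, (source_filename, target_filename)]; a hypothetical entry for illustration (URL assumed):

_ENCS_TRAIN_DATASETS = [
    ["http://data.example.org/encs-corpus.tgz", ("corpus.en", "corpus.cs")],
]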