def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _ENDE_TRAIN_DATASETS)
  datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_ende_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
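# The snippets in this collection end by delegating to a token_generator
# helper. As a hedged reference for the contract they assume (a sketch, not
# the library source; module-level imports such as os and tf are assumed as
# in the surrounding snippets): it walks two line-aligned parallel files,
# encodes each line with the token vocabulary, appends EOS, and yields
# {"inputs", "targets"} dicts.
def _token_generator_sketch(source_path, target_path, token_vocab, eos=None):
  eos_list = [] if eos is None else [eos]
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      while source and target:
        source_ints = token_vocab.encode(source.strip()) + eos_list
        target_ints = token_vocab.encode(target.strip()) + eos_list
        yield {"inputs": source_ints, "targets": target_ints}
        source, target = source_file.readline(), target_file.readline()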
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _TRAIN_DATASETS["de-es"])
  datasets = _TRAIN_DATASETS["de-es"] if train else _TEST_DATASETS["de-es"]
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "legal_dees_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      _ENFR_TRAIN_SMALL_DATA)
  if self.use_small_dataset:
    datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA
  else:
    datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_enfr_tok_%s" % tag)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  symbolizer_vocab = generator_utils.get_or_generate_txt_vocab(
      data_dir, self.vocab_file, self.targeted_vocab_size,
      filepatterns=[ENDE_TRAIN_TOK_SRC, ENDE_TRAIN_TOK_TRG])
  if train:
    data_src = ENDE_TRAIN_TOK_SRC
    data_trg = ENDE_TRAIN_TOK_TRG
  else:
    data_src = ENDE_DEV_TOK_SRC
    data_trg = ENDE_DEV_TOK_TRG
  return token_generator(data_src, data_trg, symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in datasets]
  target_datasets = [[item[0], [item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      source_datasets + target_datasets)
  tag = "train" if train else "dev"
  data_path = translate._compile_data(tmp_dir, datasets,
                                      "setimes_mken_tok_%s" % tag)
  # We generate English->X data by convention; to train the reverse
  # translation, add the "_rev" suffix to the problem name, e.g.:
  # --problems=translate_enmk_setimes32k_rev
  return translate.token_generator(data_path + ".lang2", data_path + ".lang1",
                                   symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train): """Instance of token generator for the WMT en->de task, training set.""" dataset_path = ("train.tok.clean.bpe.32000" if train else "newstest2013.tok.bpe.32000") train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path) token_tmp_path = os.path.join(tmp_dir, self.vocab_file) token_path = os.path.join(data_dir, self.vocab_file) tf.gfile.Copy(token_tmp_path, token_path, overwrite=True) with tf.gfile.GFile(token_path, mode="a") as f: f.write("UNK\n") # Add UNK to the vocab. token_vocab = text_encoder.TokenTextEncoder(token_path, replace_oov="UNK") return translate.token_generator(train_path + ".en", train_path + ".de", token_vocab, EOS)
def generator(self, data_dir, tmp_dir, train): """Instance of token generator for the WMT en->de task, training set.""" dataset_path = ("train.tok.clean.bpe.32000" if train else "newstest2013.tok.bpe.32000") train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path) token_tmp_path = os.path.join(tmp_dir, self.vocab_file) token_path = os.path.join(data_dir, self.vocab_file) tf.gfile.Copy(token_tmp_path, token_path, overwrite=True) with tf.gfile.GFile(token_path, mode="r") as f: vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n" with tf.gfile.GFile(token_path, mode="w") as f: f.write(vocab_data) token_vocab = text_encoder.TokenTextEncoder(token_path, replace_oov="UNK") return translate.token_generator(train_path + ".en", train_path + ".de", token_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  tf.gfile.MakeDirs(data_dir)
  vocab_filepath_old = os.path.join(tmp_dir, self.shared_vocab_name)
  vocab_filepath_new = os.path.join(data_dir, self.shared_vocab_name)
  tf.gfile.Copy(vocab_filepath_old, vocab_filepath_new, overwrite=True)
  shared_vocab = text_encoder.TokenTextEncoder(vocab_filepath_new,
                                               replace_oov="<unk>")
  tag = "trn" if train else "dev"
  source_path = os.path.join(tmp_dir, "src.%s" % tag)
  target_path = os.path.join(tmp_dir, "tgt.%s" % tag)
  return translate.token_generator(source_path, target_path, shared_vocab,
                                   text_encoder.EOS_ID)
def generator(self, data_dir, tmp_dir, train):
  datasets = _TRAIN_DATASETS if train else _TEST_DATASETS
  source_datasets = [[FLAGS.raw_data_dir, [item[0]]] for item in datasets]
  target_datasets = [[FLAGS.raw_data_dir, [item[1]]] for item in datasets]
  # Copy the vocab to the data directory.
  vocab_path = os.path.join(data_dir, self.vocab_name)
  if os.path.exists(vocab_path):
    os.remove(vocab_path)
  copyVocab(os.path.join(FLAGS.raw_data_dir, _VOCABS[2]), vocab_path)
  token_vocab = text_encoder.TokenTextEncoder(vocab_path, replace_oov="<unk>")
  tag = "train" if train else "dev"
  data_path = _compile_data(tmp_dir, datasets, "generic_tok_%s" % tag)
  return translate.token_generator(data_path + ".src", data_path + ".trg",
                                   token_vocab, EOS)
def generator(self, data_dir, tmp_dir, train): print("jsme v subwords generatoru.....") vocab_path = os.path.join(tmp_dir, self.vocab_name) print(vocab_path) vocab_data_path = os.path.join(data_dir, self.vocab_name) corpus_src = os.path.join(tmp_dir, self.corpus_lang(train, self.SRC_LANG)) corpus_tgt = os.path.join(tmp_dir, self.corpus_lang(train, self.TGT_LANG)) symbolizer_vocab = self.generate_vocab( data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, [ self.corpus_lang(train, self.SRC_LANG), self.corpus_lang(train, self.TGT_LANG) ]) return translate.token_generator(corpus_src, corpus_tgt, symbolizer_vocab, EOS)
def generator(self, data_dir, tmp_dir, train): """Instance of token generator for the mn->zh task, training set.""" dataset_path = ("train.32k" if train else "valid.32k") train_path = os.path.join(data_dir, dataset_path) source_token_path = os.path.join(data_dir, self.source_vocab_name()) target_token_path = os.path.join(data_dir, self.target_vocab_name()) for token_path in [source_token_path, target_token_path]: with tf.gfile.GFile(token_path, mode="r") as f: vocab_data = "<pad>\n<EOS>\n" + f.read() + "UNK\n" with tf.gfile.GFile(token_path, mode="w") as f: f.write(vocab_data) source_token_vocab = text_encoder.TokenTextEncoder(source_token_path, replace_oov="UNK") target_token_vocab = text_encoder.TokenTextEncoder(source_token_path, replace_oov="UNK") return translate.token_generator(train_path + ".mn", train_path + ".zh", source_token_vocab, target_token_vocab, EOS)
def generator(self, data_dir, tmp_dir, train): """Instance of token generator for the WMT en->de task, training set.""" # corpus = self.corpus(train) # corpus_path = os.path.join(tmp_dir, corpus) # print(corpus_path) vocab_path = os.path.join(tmp_dir, self.vocab_name) print(vocab_path) vocab_data_path = os.path.join(data_dir, self.vocab_name) corp_path_tgt = os.path.join(tmp_dir, self.corpus_lang(train, self.TGT_LANG)) corp_path_src = os.path.join(tmp_dir, self.corpus_lang(train, self.SRC_LANG)) if train: # create vocab here self._create_vocab([corp_path_tgt, corp_path_src], vocab_path) tf.gfile.Copy(vocab_path, vocab_data_path, overwrite=True) self._add_unk_to_vocab(vocab_data_path) # Add UNK to the vocab. token_vocab = text_encoder.TokenTextEncoder(vocab_data_path, replace_oov=UNK) # opraveno return translate.token_generator(corp_path_src, corp_path_tgt, token_vocab, EOS)
def generator(self, data_dir, tmp_dir, train):
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  tag = "train" if train else "dev"
  vocab_datasets = []
  data_path = translate._compile_data(tmp_dir, datasets,
                                      "wmt_encs_tok_%s" % tag)
  # CzEng contains 100 gz files with tab-separated columns, so we expect it to
  # be the first dataset in datasets and use the newly created *.lang{1,2}
  # files for vocab construction.
  if datasets[0][0].endswith("data-plaintext-format.tar"):
    vocab_datasets.append([
        datasets[0][0],
        ["wmt_encs_tok_%s.lang1" % tag, "wmt_encs_tok_%s.lang2" % tag]
    ])
    datasets = datasets[1:]
  vocab_datasets += [[item[0], [item[1][0], item[1][1]]] for item in datasets]
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size,
      vocab_datasets)
  return translate.token_generator(data_path + ".lang1", data_path + ".lang2",
                                   symbolizer_vocab, EOS)
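# All of the snippets above share the same compile-then-generate pattern. As a
# hedged sketch of the contract assumed for translate.compile_data /
# translate._compile_data (not the library implementation): each dataset entry
# pairs a download location with a (source_file, target_file) pair, and the
# helper concatenates the source sides into "<filename>.lang1" and the target
# sides into "<filename>.lang2" under tmp_dir, returning the common path
# prefix. A simplified version that assumes the per-dataset files are already
# downloaded and extracted into tmp_dir:
def _compile_data_sketch(tmp_dir, datasets, filename):
  prefix = os.path.join(tmp_dir, filename)
  with tf.gfile.GFile(prefix + ".lang1", mode="w") as lang1_out:
    with tf.gfile.GFile(prefix + ".lang2", mode="w") as lang2_out:
      for _, (lang1_file, lang2_file) in datasets:
        with tf.gfile.GFile(os.path.join(tmp_dir, lang1_file), mode="r") as f:
          lang1_out.write(f.read())
        with tf.gfile.GFile(os.path.join(tmp_dir, lang2_file), mode="r") as f:
          lang2_out.write(f.read())
  return prefix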