def generate_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  dataset_path = (ENZH_RAW_DATASETS["TRAIN"] if train
                  else ENZH_RAW_DATASETS["DEV"])
  train_path = get_enzh_raw_dataset(tmp_dir, dataset_path)
  return text_problems.text2text_txt_iterator(train_path + ".en",
                                              train_path + ".zh")
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = compile_data(tmp_dir, datasets,
                           "%s-compiled-%s" % (self.name, tag))
  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2")
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = self.source_data_files(dataset_split)
  # Disabled sanity check, kept for reference:
  # for item in datasets:
  #   dummy_file_name = item[0].split('/')[-1]
  #   create_dummy_tar(tmp_dir, dummy_file_name)
  #   s_file, t_file = item[1][0], item[1][1]
  #   if not os.path.exists(os.path.join(tmp_dir, s_file)):
  #     raise Exception("Make sure file '%s' exists in tmp dir" % s_file)
  #   if not os.path.exists(os.path.join(tmp_dir, t_file)):
  #     raise Exception("Make sure file '%s' exists in tmp dir" % t_file)
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  # Use the second file of each pair for the target side; the original
  # indexed item[1][0] here, which duplicated the source files.
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab_filename = os.path.join(data_dir, self.source_vocab_name)
  target_vocab_filename = os.path.join(data_dir, self.target_vocab_name)
  source_encoder = text_encoder.TokenTextEncoder(
      source_vocab_filename, replace_oov=self.oov_token)
  target_encoder = text_encoder.TokenTextEncoder(
      target_vocab_filename, replace_oov=self.oov_token)
  tag = "train" if train else "dev"
  filename_base = "%s-compiled-%s" % (self.name, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_encoder, target_encoder)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  # Vocab
  src_token_path = (os.path.join(data_dir, self.source_vocab_name),
                    self.source_vocab_name)
  target_token_path = (os.path.join(data_dir, self.target_vocab_name),
                       self.target_vocab_name)
  for token_path, vocab_name in [src_token_path, target_token_path]:
    if not tf.gfile.Exists(token_path):
      bpe_vocab = os.path.join(tmp_dir, vocab_name)
      with tf.gfile.Open(bpe_vocab) as f:
        vocab_list = f.read().split("\n")
      vocab_list.append(self.oov_token)
      text_encoder.TokenTextEncoder(
          None, vocab_list=vocab_list).store_to_file(token_path)
  tag = 'eval'
  if dataset_split == problem.DatasetSplit.TRAIN:
    tag = 'train'
  fn_inputs = os.path.join(tmp_dir, "inputs.{}.txt".format(tag))
  fn_targets = os.path.join(tmp_dir, "targets.{}.txt".format(tag))
  return text_problems.text2text_txt_iterator(fn_inputs, fn_targets)
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".en"
  lang2_fname = filename + ".sv"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                    lang1_fname, lang2_fname)
    return filename  # The original fell through here and recompiled anyway.
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        path = dataset[0]
        lang1_filename, lang2_filename = dataset[1]
        lang1_filepath = os.path.join(path, lang1_filename)
        lang2_filepath = os.path.join(path, lang2_filename)
        is_sgm = (lang1_filename.endswith("sgm") and
                  lang2_filename.endswith("sgm"))
        for example in text_problems.text2text_txt_iterator(
            lang1_filepath, lang2_filepath):
          line1res = _preprocess_sgm(example["inputs"], is_sgm)
          line2res = _preprocess_sgm(example["targets"], is_sgm)
          if line1res and line2res:
            lang1_resfile.write(line1res)
            lang1_resfile.write("\n")
            lang2_resfile.write(line2res)
            lang2_resfile.write("\n")
  return filename
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  if dataset_split == problem.DatasetSplit.TRAIN:
    source_file_name = self.compile_corpus_files(
        data_dir, SOURCE_TRAIN_FILES, self.source_compiled_corpus_filename[0])
    target_file_name = self.compile_corpus_files(
        data_dir, TARGET_TRAIN_FILES, self.target_compiled_corpus_filename[0])
  elif dataset_split == problem.DatasetSplit.EVAL:
    source_file_name = self.compile_corpus_files(
        data_dir, SOURCE_DEV_FILES, self.source_compiled_corpus_filename[1])
    target_file_name = self.compile_corpus_files(
        data_dir, TARGET_DEV_FILES, self.target_compiled_corpus_filename[1])
  elif dataset_split == problem.DatasetSplit.TEST:
    source_file_name = self.compile_corpus_files(
        data_dir, SOURCE_TEST_FILES, self.source_compiled_corpus_filename[2])
    target_file_name = self.compile_corpus_files(
        data_dir, TARGET_TEST_FILES, self.target_compiled_corpus_filename[2])
  return text_problems.text2text_txt_iterator(
      os.path.join(data_dir, source_file_name),
      os.path.join(data_dir, target_file_name))
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = my_spm_utils.get_or_generate_spm(
      data_dir, tmp_dir, vocab_size=self.approx_vocab_size,
      model_prefix=self.source_vocab_name, sources=source_datasets,
      file_byte_budget=1e10)
  target_vocab = my_spm_utils.get_or_generate_spm(
      data_dir, tmp_dir, vocab_size=int(self.approx_vocab_size / 2),
      model_prefix=self.target_vocab_name, sources=target_datasets,
      file_byte_budget=1e10)
  tag = "train" if train else "dev"
  filename_base = "wmt_enja_%sk_tok_%s" % (self.approx_vocab_size, tag)
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generate_text_for_vocab(self, data_dir, tmp_dir):
  files = [os.path.join(tmp_dir, f) for f in self.TRAIN_FILES]
  inputs_file, targets_file = files
  for sample in text_problems.text2text_txt_iterator(inputs_file,
                                                     targets_file):
    yield sample["inputs"]
    yield sample["targets"]
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  source_datasets = _INPUT_FILES
  target_datasets = _OUTPUT_FILES
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "test"
  filename_src = "en_{}.src".format(tag)
  filename_dst = "ru_{}.dst".format(tag)
  data_path = './shad_nlp18_contextNMT/data_fused/'
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + filename_src,
                                           data_path + filename_dst),
      source_vocab, target_vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  # Create shared vocabulary.
  if self.vocab_type == "subwords":
    data_path = translate.compile_data(
        tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
    self.get_or_create_vocab(data_dir, tmp_dir)
    sample_iterator = text_problems.text2text_txt_iterator(
        data_path + ".lang1", data_path + ".lang2")
  elif self.vocab_type == "tokens":
    sample_iterator = super().generate_samples(data_dir, tmp_dir,
                                               dataset_split)
  else:
    raise ValueError("VocabType not supported")
  # Create source feature vocabularies.
  data_path = self.compile_sfeat_data(
      tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
  self.create_src_feature_vocabs(data_dir, tmp_dir)
  sfeat_iterator = text_problems.txt_line_iterator(data_path + ".sfeat")

  def _generate(sample_iterator, sfeat_iterator):
    for sample in sample_iterator:
      sample["sfeats"] = next(sfeat_iterator)
      yield sample

  return _generate(sample_iterator, sfeat_iterator)
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Instance of token generator for the WMT en->de task, training set.""" dataset_path = ("num2text-p8-v7/num2text") train_path = _get_num2text_dataset(tmp_dir, dataset_path) return text_problems.text2text_txt_iterator( train_path + "_num_p8_v7.txt", train_path + "_txt_p8_v7.txt")
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset  # if train else _NC_TEST_DATASETS
  source_datasets = [[item[0], [item[1][0]]] for item in train_dataset]
  target_datasets = [[item[0], [item[1][1]]] for item in train_dataset]
  source_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      source_datasets, file_byte_budget=1e8)
  target_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.target_vocab_name, self.approx_vocab_size,
      target_datasets, file_byte_budget=1e8)
  tag = "train" if train else "dev"
  filename_base = tag
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".source",
                                           data_path + ".target"),
      source_vocab, target_vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "wmt_encs_chr_%s" % tag)
  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2")
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  is_train_dataset = dataset_split == problem.DatasetSplit.TRAIN
  dataset_label = 'train' if is_train_dataset else 'dev'
  ext = '.txt'
  he_path = os.path.join(data_dir, 'he.' + dataset_label + ext)
  en_path = os.path.join(data_dir, 'en.' + dataset_label + ext)
  return text_problems.text2text_txt_iterator(he_path, en_path)
def testText2TextTxtIterator(self):
  inputs = []
  targets = []
  for entry in text_problems.text2text_txt_iterator(self.inputs_file,
                                                    self.targets_file):
    inputs.append(entry["inputs"])
    targets.append(entry["targets"])
  self.assertEqual(inputs, self.inputs)
  self.assertEqual(targets, self.targets)
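All of the snippets here lean on the same contract, which this test confirms: text2text_txt_iterator walks two parallel text files line by line and yields {"inputs": ..., "targets": ...} dicts. A minimal pure-Python sketch of that behavior (an illustration only, not the tensor2tensor implementation, which reads through tf.gfile):

def text2text_txt_iterator_sketch(source_path, target_path):
  """Yields {"inputs": ..., "targets": ...} dicts from two parallel files."""
  with open(source_path) as source_file, open(target_path) as target_file:
    for inputs, targets in zip(source_file, target_file):
      yield {"inputs": inputs.strip(), "targets": targets.strip()}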
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  training_source_file = os.path.join(config.DATA_DIR, 'training_source.txt')
  training_target_file = os.path.join(config.DATA_DIR, 'training_target.txt')
  dev_source_file = os.path.join(config.DATA_DIR, 'dev_source.txt')
  dev_target_file = os.path.join(config.DATA_DIR, 'dev_target.txt')

  train = dataset_split == problem.DatasetSplit.TRAIN
  source_file = training_source_file if train else dev_source_file
  target_file = training_target_file if train else dev_target_file

  # def generator_samples_content(get_source, get_target):
  #   source, target = None, None
  #   with tf.gfile.GFile(source_file, mode='r') as f_x_train, \
  #       tf.gfile.GFile(target_file, mode='r') as f_y_train:
  #     mrs = f_x_train.read().splitlines()
  #     utterances = f_y_train.read().splitlines()
  #     for mr, utt in zip(mrs, utterances):
  #       yield mr, utt
  #
  # def generator_source():
  #   for source, _ in generator_samples_content(False, True):
  #     yield source.strip()
  #
  # def generator_target():
  #   for _, target in generator_samples_content(False, True):
  #     yield target.strip()
  #
  # # Generate vocab for both source and target.
  # source_vocab = generator_utils.get_or_generate_vocab_inner(
  #     data_dir=data_dir,
  #     vocab_filename=self.vocab_input_filename,
  #     vocab_size=self.input_vocab_size,
  #     generator=generator_source())
  #
  # target_vocab = generator_utils.get_or_generate_vocab_inner(
  #     data_dir=data_dir,
  #     vocab_filename=self.vocab_target_filename,
  #     vocab_size=self.target_vocab_size,
  #     generator=generator_target())
  #
  # with io.open('data/training_source.txt', 'r', encoding='utf8') as f_x_train, \
  #     io.open('data/training_target.txt', 'r', encoding='utf8') as f_y_train:
  #   mrs = f_x_train.read().splitlines()
  #   utterances = f_y_train.read().splitlines()
  #   for mr, utt in zip(mrs, utterances):
  #     yield {
  #         'inputs': mr,
  #         'targets': utt
  #     }

  return text_problems.text2text_txt_iterator(source_file, target_file)
def generate_text_for_vocab(self, data_dir, tmp_dir):
  files = [os.path.join(tmp_dir, f) for f in self.TRAIN_FILES]
  inputs_file, targets_file = files
  for i, sample in enumerate(
      text_problems.text2text_txt_iterator(inputs_file, targets_file)):
    yield sample["inputs"]
    yield sample["targets"]
    if self.max_samples_for_vocab and (i + 1) >= self.max_samples_for_vocab:
      break
def generate_samples(self, data_dir, _tmp_dir, dataset_split):  #pylint: disable=no-self-use
  """Returns a generator that yields {"inputs": [text], "targets": [text]} dicts."""
  functions_file_path = os.path.join(data_dir,
                                     '{}.function'.format(dataset_split))
  docstrings_file_path = os.path.join(data_dir,
                                      '{}.docstring'.format(dataset_split))
  return text_problems.text2text_txt_iterator(functions_file_path,
                                              docstrings_file_path)
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Instance of token generator for the WMT en->de task, training set.""" if dataset_split == problem.DatasetSplit.TRAIN: dataset_path = OUT_SENTS else: dataset_path = OUT_TEST return text_problems.text2text_txt_iterator(dataset_path + "en", dataset_path + "fr")
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Returns the generator of {"inputs": [text], "targets": [text]} dict.""" functions_file_path = os.path.join( data_dir, '{}.function'.format(dataset_split)) docstrings_file_path = os.path.join( data_dir, '{}.docstring'.format(dataset_split)) return text_problems.text2text_txt_iterator( functions_file_path, docstrings_file_path)
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Returns the generator of {"inputs": [text], "targets": [text]} dict.""" functions_file_path = os.path.join(data_dir, '{}.function'.format(dataset_split)) docstrings_file_path = os.path.join( data_dir, '{}.docstring'.format(dataset_split)) return text_problems.text2text_txt_iterator(functions_file_path, docstrings_file_path)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  is_train_dataset = dataset_split == problem.DatasetSplit.TRAIN
  ext = '.txt'
  dataset_label = (os.getenv('TRAIN_NAME') if is_train_dataset
                   else os.getenv('DEV_NAME'))
  original_data_dir = os.getenv('DATA_DIR')
  he_path = os.path.join(original_data_dir, 'he.' + dataset_label + ext)
  en_path = os.path.join(original_data_dir, 'en.' + dataset_label + ext)
  return text_problems.text2text_txt_iterator(he_path, en_path)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  training_source_file = os.path.join(config.DATA_DIR, 'training_source.txt')
  training_target_file = os.path.join(config.DATA_DIR, 'training_target.txt')
  dev_source_file = os.path.join(config.DATA_DIR, 'dev_source.txt')
  dev_target_file = os.path.join(config.DATA_DIR, 'dev_target.txt')
  train = dataset_split == problem.DatasetSplit.TRAIN
  source_file = training_source_file if train else dev_source_file
  target_file = training_target_file if train else dev_target_file
  return text_problems.text2text_txt_iterator(source_file, target_file)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = translate.compile_data(
      tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
  # For eval, use authentic data.
  if dataset_split != problem.DatasetSplit.TRAIN:
    for example in text_problems.text2text_txt_iterator(
        data_path + ".lang1", data_path + ".lang2"):
      yield example
  else:
    # For training, mix synthetic and authentic data as follows.
    for (file1, file2) in self.backtranslate_data_filenames:
      path1 = os.path.join(tmp_dir, file1)
      path2 = os.path.join(tmp_dir, file2)
      # Synthetic data first.
      for example in text_problems.text2text_txt_iterator(path1, path2):
        yield example
    # Now authentic data.
    for example in text_problems.text2text_txt_iterator(
        data_path + ".lang1", data_path + ".lang2"):
      yield example
def testCompileData(self):
  filename = "out"
  filepath = os.path.join(self.tmp_dir, filename)
  translate.compile_data(self.tmp_dir, self.DATASETS, filename)
  count = 0
  for i, example in enumerate(
      text_problems.text2text_txt_iterator(filepath + ".lang1",
                                           filepath + ".lang2")):
    expected = self.data[i]
    self.assertEqual(list(expected), [example["inputs"], example["targets"]])
    count += 1
  self.assertEqual(count, len(self.data))
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = compile_data(tmp_dir, datasets,
                           "%s-compiled-%s" % (self.name, tag))
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  return text_problems.text2text_txt_iterator(data_path + ".lang1",
                                              data_path + ".lang2")
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _TRAIN_DATASETS if train else _DEV_DATASETS
  vocab_list = []
  print("=======Get Vocab from ", self.vocab_name, '...', end='')
  with open(self.vocab_name, 'r', encoding='utf-8') as f:
    vocab_list = f.read().splitlines()
  print("=======Done")
  vocab = text_encoder.TokenTextEncoder(
      vocab_filename=None, vocab_list=vocab_list, replace_oov="<UNK>",
      num_reserved_ids=text_encoder.NUM_RESERVED_TOKENS)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(datasets[0], datasets[1]),
      vocab, vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  datasets = self.source_data_files(dataset_split)
  tag = "train" if dataset_split == problem.DatasetSplit.TRAIN else "dev"
  data_path = translate.compile_data(
      tmp_dir, datasets, "%s-compiled-%s" % (self.name, tag))
  # Iterator over authentic data.
  it_auth = text_problems.text2text_txt_iterator(
      data_path + ".lang1", data_path + ".lang2")
  # For eval, use authentic data.
  if dataset_split != problem.DatasetSplit.TRAIN:
    for example in it_auth:
      yield example
  else:
    # For training, mix synthetic and authentic data as follows.
    for (file1, file2) in self.backtranslate_data_filenames:
      path1 = os.path.join(tmp_dir, file1)
      path2 = os.path.join(tmp_dir, file2)
      # Synthetic data first.
      for example in text_problems.text2text_txt_iterator(path1, path2):
        yield example
    # Now authentic data.
    for example in it_auth:
      yield example
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_dataset = self.get_training_dataset(tmp_dir)
  datasets = train_dataset if train else _NC_TEST_DATASETS
  source_vocab = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.source_vocab_name),
      replace_oov=self.oov_token)
  target_vocab = text_encoder.TokenTextEncoder(
      os.path.join(data_dir, self.target_vocab_name),
      replace_oov=self.oov_token)
  tag = "train" if train else "dev"
  filename_base = "wmt_pdre_tok_%s" % tag
  data_path = translate.compile_data(tmp_dir, datasets, filename_base)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      source_vocab, target_vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  train = dataset_split == problem.DatasetSplit.TRAIN
  train_path = (
      os.path.join(tmp_dir, "oversampled", GEC_DATASETS["TRAIN"]) if train
      else os.path.join(tmp_dir, "wi_locness", GEC_DATASETS["DEV"]))
  vocab_path = os.path.join(tmp_dir, GEC_DATASETS["VOCAB"])
  vocab_path_src_tgt = os.path.join(data_dir, GEC_DATASETS["VOCAB"])
  if not tf.gfile.Exists(vocab_path_src_tgt):
    tf.gfile.Copy(vocab_path, vocab_path_src_tgt)
    with tf.gfile.GFile(vocab_path_src_tgt, mode="r") as fr:
      vocab_data = "<pad>\n<EOS>\n" + fr.read()
    with tf.gfile.GFile(vocab_path_src_tgt, mode="w") as fw:
      fw.write(vocab_data)
  return text_problems.text2text_txt_iterator(train_path + ".src",
                                              train_path + ".tgt")
def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
  symbolizer_vocab = generator_utils.get_or_generate_vocab(
      data_dir, tmp_dir, self.source_vocab_name, self.approx_vocab_size,
      _ZHZH_TRAIN_DATASETS, file_byte_budget=1e8)
  train = dataset_split == problem.DatasetSplit.TRAIN
  datasets = _ZHZH_TRAIN_DATASETS if train else _ZHZH_TEST_DATASETS
  tag = "train" if train else "dev"
  data_path = translate.compile_data(tmp_dir, datasets,
                                     "mydata_enzh_tok_%s" % tag)
  return text_problems.text2text_generate_encoded(
      text_problems.text2text_txt_iterator(data_path + ".lang1",
                                           data_path + ".lang2"),
      symbolizer_vocab, symbolizer_vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Instance of token generator for the WMT en->de task, training set.""" train = dataset_split == problem.DatasetSplit.TRAIN dataset_path = ("train.tok.clean.bpe.32000" if train else "newstest2013.tok.bpe.32000") train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path) # Vocab vocab_path = os.path.join(data_dir, self.vocab_filename) if not tf.gfile.Exists(vocab_path): bpe_vocab = os.path.join(tmp_dir, "vocab.bpe.32000") with tf.gfile.Open(bpe_vocab) as f: vocab_list = f.read().split("\n") vocab_list.append(self.oov_token) text_encoder.TokenTextEncoder( None, vocab_list=vocab_list).store_to_file(vocab_path) return text_problems.text2text_txt_iterator(train_path + ".en", train_path + ".de")
def generate_samples(self, data_dir, tmp_dir, dataset_split): """Instance of token generator for the WMT en->de task, training set.""" train = dataset_split == problem.DatasetSplit.TRAIN dataset_path = ("train.tok.clean.bpe.32000" if train else "newstest2013.tok.bpe.32000") train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path) # Vocab vocab_path = os.path.join(data_dir, self.vocab_filename) if not tf.gfile.Exists(vocab_path): bpe_vocab = os.path.join(tmp_dir, "vocab.bpe.32000") with tf.gfile.Open(bpe_vocab) as f: vocab_list = f.read().split("\n") vocab_list.append(self.oov_token) text_encoder.TokenTextEncoder( None, vocab_list=vocab_list).store_to_file(vocab_path) return text_problems.text2text_txt_iterator(train_path + ".en", train_path + ".de")
def generate_samples(self, data_dir, tmp_dir, dataset_split):
  dataset = self.dataset_url(dataset_split)
  url = dataset[0][0]
  compressed_filename = os.path.basename(url)
  compressed_filepath = os.path.join(tmp_dir, compressed_filename)
  generator_utils.maybe_download(tmp_dir, compressed_filename, url)
  mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
  with tarfile.open(compressed_filepath, mode) as corpus_tar:
    corpus_tar.extractall(tmp_dir)
  if self.vocab_type == text_problems.VocabType.SUBWORD:
    generator_utils.get_or_generate_vocab(
        data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size,
        self.vocab_data_files())
  source_file, target_file = self.source_target_paths(dataset_split, tmp_dir)
  return text_problems.text2text_txt_iterator(source_file, target_file)
def compile_data(tmp_dir, datasets, filename):
  """Concatenate all `datasets` and save to `filename`."""
  filename = os.path.join(tmp_dir, filename)
  lang1_fname = filename + ".lang1"
  lang2_fname = filename + ".lang2"
  if tf.gfile.Exists(lang1_fname) and tf.gfile.Exists(lang2_fname):
    tf.logging.info("Skipping compile data, found files:\n%s\n%s",
                    lang1_fname, lang2_fname)
    return filename
  with tf.gfile.GFile(lang1_fname, mode="w") as lang1_resfile:
    with tf.gfile.GFile(lang2_fname, mode="w") as lang2_resfile:
      for dataset in datasets:
        url = dataset[0]
        compressed_filename = os.path.basename(url)
        compressed_filepath = os.path.join(tmp_dir, compressed_filename)
        if url.startswith("http"):
          generator_utils.maybe_download(tmp_dir, compressed_filename, url)

        if dataset[1][0] == "tsv":
          _, src_column, trg_column, glob_pattern = dataset[1]
          filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          if not filenames:
            # Capture *.tgz and *.tar.gz too.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
            filenames = tf.gfile.Glob(os.path.join(tmp_dir, glob_pattern))
          for tsv_filename in filenames:
            if tsv_filename.endswith(".gz"):
              # Drop the ".gz" suffix; the original used str.strip(".gz"),
              # which strips characters rather than the suffix.
              new_filename = tsv_filename[:-len(".gz")]
              generator_utils.gunzip_file(tsv_filename, new_filename)
              tsv_filename = new_filename
            with tf.gfile.Open(tsv_filename) as tsv_file:
              for line in tsv_file:
                if line and "\t" in line:
                  parts = line.split("\t")
                  source, target = parts[src_column], parts[trg_column]
                  source, target = source.strip(), target.strip()
                  if source and target:
                    lang1_resfile.write(source)
                    lang1_resfile.write("\n")
                    lang2_resfile.write(target)
                    lang2_resfile.write("\n")
        else:
          lang1_filename, lang2_filename = dataset[1]
          lang1_filepath = os.path.join(tmp_dir, lang1_filename)
          lang2_filepath = os.path.join(tmp_dir, lang2_filename)
          is_sgm = (lang1_filename.endswith("sgm") and
                    lang2_filename.endswith("sgm"))

          if not (tf.gfile.Exists(lang1_filepath) and
                  tf.gfile.Exists(lang2_filepath)):
            # For .tar.gz and .tgz files, we read compressed.
            mode = "r:gz" if compressed_filepath.endswith("gz") else "r"
            with tarfile.open(compressed_filepath, mode) as corpus_tar:
              corpus_tar.extractall(tmp_dir)
          if lang1_filepath.endswith(".gz"):
            new_filepath = lang1_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang1_filepath, new_filepath)
            lang1_filepath = new_filepath
          if lang2_filepath.endswith(".gz"):
            new_filepath = lang2_filepath[:-len(".gz")]
            generator_utils.gunzip_file(lang2_filepath, new_filepath)
            lang2_filepath = new_filepath

          for example in text_problems.text2text_txt_iterator(
              lang1_filepath, lang2_filepath):
            line1res = _preprocess_sgm(example["inputs"], is_sgm)
            line2res = _preprocess_sgm(example["targets"], is_sgm)
            if line1res and line2res:
              lang1_resfile.write(line1res)
              lang1_resfile.write("\n")
              lang2_resfile.write(line2res)
              lang2_resfile.write("\n")

  return filename
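As the two branches above imply, each entry of datasets is a (url, spec) pair: spec is either ("tsv", source_column, target_column, glob_pattern) for tab-separated corpora, or a (lang1_filename, lang2_filename) pair naming parallel files inside the downloaded archive. A usage sketch, with made-up URLs and archive members:

# Illustrative dataset specs; URLs and archive members are hypothetical.
_EXAMPLE_DATASETS = [
    # Two parallel text files extracted from a tarball.
    ("http://example.com/corpus.tar.gz",
     ("corpus/train.lang1", "corpus/train.lang2")),
    # A TSV corpus: column 0 holds the source, column 1 the target.
    ("http://example.com/pairs.tar.gz",
     ("tsv", 0, 1, "pairs/*.tsv")),
]

data_path = compile_data("/tmp/t2t_datagen", _EXAMPLE_DATASETS,
                         "demo-compiled-train")
# compile_data returns the path prefix; the concatenated corpora live in
# data_path + ".lang1" / ".lang2", ready for text2text_txt_iterator.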