def tabbed_parsing_character_generator(tmp_dir, train):
    """Generate byte-encoded source/target data from one tabbed pairs file.

    The file read is ``parsing_train.pairs`` when ``train`` is truthy and
    ``parsing_dev.pairs`` otherwise, looked up inside ``tmp_dir``.

    Args:
        tmp_dir: Directory that contains the ``parsing_*.pairs`` file.
        train: Truthy to select the training file, falsy for the dev file.

    Returns:
        Whatever ``text_problems.text2text_generate_encoded`` returns for the
        tab-separated samples, using a single ``ByteTextEncoder`` as the
        vocabulary argument.
    """
    byte_encoder = text_encoder.ByteTextEncoder()
    split_name = "train" if train else "dev"
    pair_filepath = os.path.join(tmp_dir, "parsing_{0}.pairs".format(split_name))
    raw_samples = text_problems.text2text_txt_tab_iterator(pair_filepath)
    return text_problems.text2text_generate_encoded(raw_samples, byte_encoder)
def tabbed_parsing_character_generator(tmp_dir, train):
    """Generate byte-encoded source/target data from one tabbed pairs file.

    The file read is ``parsing_train.pairs`` when ``train`` is truthy and
    ``parsing_dev.pairs`` otherwise, looked up inside ``tmp_dir``.

    Args:
        tmp_dir: Directory that contains the ``parsing_*.pairs`` file.
        train: Truthy to select the training file, falsy for the dev file.

    Returns:
        Whatever ``text_problems.text2text_generate_encoded`` returns for the
        tab-separated samples, using a single ``ByteTextEncoder`` as the
        vocabulary argument.
    """
    byte_encoder = text_encoder.ByteTextEncoder()
    split_name = "train" if train else "dev"
    pair_filepath = os.path.join(tmp_dir, "parsing_{0}.pairs".format(split_name))
    raw_samples = text_problems.text2text_txt_tab_iterator(pair_filepath)
    return text_problems.text2text_generate_encoded(raw_samples, byte_encoder)
def testText2TextTxtTabIterator(self):
    """Check the tab iterator yields the fixture's inputs and targets in order."""
    # Single pass over the iterator; each entry is read as soon as it is
    # yielded, exactly like an explicit append loop would do.
    pairs = [
        (entry["inputs"], entry["targets"])
        for entry in text_problems.text2text_txt_tab_iterator(self.tabbed_file)
    ]
    inputs = [source for source, _ in pairs]
    targets = [target for _, target in pairs]
    self.assertEqual(inputs, self.inputs)
    self.assertEqual(targets, self.targets)
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
                                   source_vocab_size, target_vocab_size):
    """Generate token-encoded source/target data from one tabbed pairs file.

    Separate vocabularies are obtained for column 0 (source) and column 1
    (target) of the pairs file via
    ``generator_utils.get_or_generate_tabbed_vocab``; the samples read from
    that file are then encoded with them.

    Args:
        data_dir: Forwarded to ``get_or_generate_tabbed_vocab``.
        tmp_dir: Directory that contains the ``parsing_*.pairs`` file; also
            forwarded to ``get_or_generate_tabbed_vocab``.
        train: Truthy to select ``parsing_train.pairs``, falsy for
            ``parsing_dev.pairs``.
        prefix: Prefix prepended to both vocabulary file names.
        source_vocab_size: Size passed for the source vocabulary; it is also
            embedded in the source vocabulary file name.
        target_vocab_size: Size passed for the target vocabulary; it is also
            embedded in the target vocabulary file name.

    Returns:
        Whatever ``text_problems.text2text_generate_encoded`` returns for the
        tab-separated samples and the two vocabularies.
    """
    split_name = "train" if train else "dev"
    filename = "parsing_{0}.pairs".format(split_name)

    # Source side: column 0 of the tab-separated file.
    source_vocab_name = prefix + "_source.tokens.vocab.%d" % source_vocab_size
    source_vocab = generator_utils.get_or_generate_tabbed_vocab(
        data_dir, tmp_dir, filename, 0, source_vocab_name, source_vocab_size)

    # Target side: column 1 of the tab-separated file.
    target_vocab_name = prefix + "_target.tokens.vocab.%d" % target_vocab_size
    target_vocab = generator_utils.get_or_generate_tabbed_vocab(
        data_dir, tmp_dir, filename, 1, target_vocab_name, target_vocab_size)

    raw_samples = text_problems.text2text_txt_tab_iterator(
        os.path.join(tmp_dir, filename))
    return text_problems.text2text_generate_encoded(
        raw_samples, source_vocab, target_vocab)
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix,
                                   source_vocab_size, target_vocab_size):
    """Generate token-encoded source/target data from one tabbed pairs file.

    Separate vocabularies are obtained for column 0 (source) and column 1
    (target) of the pairs file via
    ``generator_utils.get_or_generate_tabbed_vocab``; the samples read from
    that file are then encoded with them.

    Args:
        data_dir: Forwarded to ``get_or_generate_tabbed_vocab``.
        tmp_dir: Directory that contains the ``parsing_*.pairs`` file; also
            forwarded to ``get_or_generate_tabbed_vocab``.
        train: Truthy to select ``parsing_train.pairs``, falsy for
            ``parsing_dev.pairs``.
        prefix: Prefix prepended to both vocabulary file names.
        source_vocab_size: Size passed for the source vocabulary; it is also
            embedded in the source vocabulary file name.
        target_vocab_size: Size passed for the target vocabulary; it is also
            embedded in the target vocabulary file name.

    Returns:
        Whatever ``text_problems.text2text_generate_encoded`` returns for the
        tab-separated samples and the two vocabularies.
    """
    split_name = "train" if train else "dev"
    filename = "parsing_{0}.pairs".format(split_name)

    # Source side: column 0 of the tab-separated file.
    source_vocab_name = prefix + "_source.tokens.vocab.%d" % source_vocab_size
    source_vocab = generator_utils.get_or_generate_tabbed_vocab(
        data_dir, tmp_dir, filename, 0, source_vocab_name, source_vocab_size)

    # Target side: column 1 of the tab-separated file.
    target_vocab_name = prefix + "_target.tokens.vocab.%d" % target_vocab_size
    target_vocab = generator_utils.get_or_generate_tabbed_vocab(
        data_dir, tmp_dir, filename, 1, target_vocab_name, target_vocab_size)

    raw_samples = text_problems.text2text_txt_tab_iterator(
        os.path.join(tmp_dir, filename))
    return text_problems.text2text_generate_encoded(
        raw_samples, source_vocab, target_vocab)
def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Return the tab-separated sample iterator for the given dataset split.

    Only the first path returned by ``self.source_data_files(dataset_split)``
    is read.

    Args:
        data_dir: Unused by this method.
        tmp_dir: Unused by this method.
        dataset_split: Passed to ``self.source_data_files`` to pick the files.

    Returns:
        The result of ``text_problems.text2text_txt_tab_iterator`` on the
        first source data file.
    """
    del data_dir, tmp_dir  # Unused.
    split_files = self.source_data_files(dataset_split)
    first_path = split_files[0]
    return text_problems.text2text_txt_tab_iterator(first_path)