def tabbed_parsing_character_generator(tmp_dir, train): """Generate source and target data from a single file.""" character_vocab = text_encoder.ByteTextEncoder() filename = "parsing_{0}.pairs".format("train" if train else "dev") pair_filepath = os.path.join(tmp_dir, filename) return tabbed_generator(pair_filepath, character_vocab, character_vocab, EOS)
def tabbed_parsing_token_generator(data_dir, tmp_dir, train, prefix, source_vocab_size, target_vocab_size): """Generate source and target data from a single file.""" filename = "parsing_{0}.pairs".format("train" if train else "dev") source_vocab = generator_utils.get_or_generate_tabbed_vocab( data_dir, tmp_dir, filename, 0, prefix + "_source.tokens.vocab.%d" % source_vocab_size, source_vocab_size) target_vocab = generator_utils.get_or_generate_tabbed_vocab( data_dir, tmp_dir, filename, 1, prefix + "_target.tokens.vocab.%d" % target_vocab_size, target_vocab_size) pair_filepath = os.path.join(tmp_dir, filename) return tabbed_generator(pair_filepath, source_vocab, target_vocab, EOS)