def create_eval_model(model_creator, hparams, mode):
    """Create eval graph, model, src/tgt file holders, and iterator."""
    graph = tf.Graph()
    with graph.as_default(), tf.container("eval"):
        # Create a table that maps words to vocab ids.
        input_vocab_table = vocab_utils.create_vocab_table(hparams.vocab_path)
        # Define placeholders for the dataset files. They are bound to concrete
        # file names at run time, so the same trained model can be evaluated
        # on different datasets during validation.
        input_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        input_dataset = tf.data.TextLineDataset(input_file_placeholder)
        output_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        output_dataset = tf.data.TextLineDataset(output_file_placeholder)
        iterator = iterator_utils.get_iterator(
            input_dataset,
            output_dataset,
            input_vocab_table,
            batch_size=hparams.eval_batch_size,
            random_seed=hparams.random_seed,
            pad=hparams.pad,
            input_max_len=hparams.input_max_len)
        model = model_creator(hparams,
                              mode,
                              iterator,
                              input_vocab_table=input_vocab_table)
    return EvalModel(graph, model, input_file_placeholder,
                     output_file_placeholder, iterator)
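# Hedged usage sketch, not part of the corpus above: how an EvalModel built by
# create_eval_model is typically driven. The EvalModel field names and the
# per-batch `model.eval(sess)` call are assumptions; checkpoint restoration is
# omitted for brevity.
def run_eval_sketch(eval_model, input_file, output_file):
    # Initializer ops must be created inside the eval graph.
    with eval_model.graph.as_default():
        tables_init = tf.tables_initializer()
    with tf.Session(graph=eval_model.graph) as sess:
        sess.run(tables_init)
        # Bind concrete file names to the placeholders, then (re)initialize
        # the iterator so the same graph can score different datasets.
        sess.run(eval_model.iterator.initializer,
                 feed_dict={eval_model.input_file_placeholder: input_file,
                            eval_model.output_file_placeholder: output_file})
        while True:
            try:
                eval_model.model.eval(sess)  # hypothetical per-batch eval step
            except tf.errors.OutOfRangeError:
                break  # eval dataset exhausted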
def create_train_model(hparams):
    """Create train graph, model, and iterator."""
    train_file = hparams.train
    vocab_size, vocab_file = vocab_utils.check_vocab(hparams.vocab_file,
                                                     hparams.out_dir,
                                                     sos=hparams.sos,
                                                     eos=hparams.eos,
                                                     unk=vocab_utils.UNK)
    hparams.add_hparam("vocab_size", vocab_size)
    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        vocab_table = lookup_ops.index_table_from_file(vocab_file,
                                                       default_value=0)
        iterator = iterator_utils.get_iterator(
            train_file,
            vocab_table,
            batch_size=hparams.batch_size,
            sos=hparams.sos,
            eos=hparams.eos,
            src_max_len=hparams.src_max_len)
        model = rnn_model.Model(hparams,
                                mode=tf.contrib.learn.ModeKeys.TRAIN,
                                iterator=iterator,
                                vocab_table=vocab_table)
    return graph, model, iterator
def create_test_iterator(hparams, mode):
    """Create test iterator."""
    src_vocab_table = lookup_ops.index_table_from_tensor(
        tf.constant([hparams.eos, "a", "b", "c", "d"]))
    tgt_vocab_mapping = tf.constant([hparams.sos, hparams.eos, "a", "b", "c"])
    tgt_vocab_table = lookup_ops.index_table_from_tensor(tgt_vocab_mapping)
    if mode == tf.contrib.learn.ModeKeys.INFER:
        reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_tensor(
            tgt_vocab_mapping)

    src_dataset = tf.data.Dataset.from_tensor_slices(
        tf.constant(["a a b b c", "a b b"]))

    if mode != tf.contrib.learn.ModeKeys.INFER:
        tgt_dataset = tf.data.Dataset.from_tensor_slices(
            tf.constant(["a b c b c", "a b c b"]))
        return (iterator_utils.get_iterator(
                    src_dataset=src_dataset,
                    tgt_dataset=tgt_dataset,
                    src_vocab_table=src_vocab_table,
                    tgt_vocab_table=tgt_vocab_table,
                    batch_size=hparams.batch_size,
                    sos=hparams.sos,
                    eos=hparams.eos,
                    source_reverse=hparams.source_reverse,
                    random_seed=hparams.random_seed,
                    num_buckets=hparams.num_buckets),
                src_vocab_table, tgt_vocab_table)
    else:
        return (iterator_utils.get_infer_iterator(
                    src_dataset=src_dataset,
                    src_vocab_table=src_vocab_table,
                    eos=hparams.eos,
                    source_reverse=hparams.source_reverse,
                    batch_size=hparams.batch_size),
                src_vocab_table, tgt_vocab_table, reverse_tgt_vocab_table)
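# Hedged sketch, assuming the INFER branch above: the extra
# reverse_tgt_vocab_table maps predicted ids back to target words. The
# `sample_ids` constant stands in for real decoder output, and the HParams
# values mirror the small test vocab rather than any real configuration.
hparams = tf.contrib.training.HParams(sos="sos", eos="eos", batch_size=2,
                                      source_reverse=False, random_seed=3,
                                      num_buckets=1)
iterator, src_vocab_table, tgt_vocab_table, reverse_tgt_vocab_table = (
    create_test_iterator(hparams, tf.contrib.learn.ModeKeys.INFER))
sample_ids = tf.constant([[2, 3, 1]], dtype=tf.int64)  # hypothetical decoder ids
sample_words = reverse_tgt_vocab_table.lookup(sample_ids)
with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    sess.run(iterator.initializer)
    print(sess.run(sample_words))  # e.g. [[b"a", b"b", b"eos"]]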
def create_eval_model(model_creator, hparams):
    """Create eval graph, model, src/tgt file holders, and iterator."""
    src1_vocab_file = "%s/%s" % (hparams.data_dir, hparams.word_vocab)
    # src2 reads the POS vocabulary, mirroring create_train_model.
    src2_vocab_file = "%s/%s" % (hparams.data_dir, hparams.pos_vocab)
    tgt_vocab_file = "%s/%s" % (hparams.data_dir, hparams.role_vocab)
    graph = tf.Graph()
    with graph.as_default(), tf.container("eval"):
        src1_vocab_table, src2_vocab_table, tgt_vocab_table = create_vocab_tables(
            src1_vocab_file, src2_vocab_file, tgt_vocab_file)
        src1_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        src2_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        src1_dataset = tf.contrib.data.TextLineDataset(src1_file_placeholder)
        src2_dataset = tf.contrib.data.TextLineDataset(src2_file_placeholder)
        tgt_dataset = tf.contrib.data.TextLineDataset(tgt_file_placeholder)
        iterator = iterator_utils.get_iterator(
            src1_dataset, src2_dataset, tgt_dataset,
            src1_vocab_table, src2_vocab_table, tgt_vocab_table,
            hparams.batch_size, hparams.num_buckets,
            hparams.src_max_len, hparams.tgt_max_len)
        with tf.device("/cpu:0"):
            model = model_creator(hparams, iterator,
                                  tf.contrib.learn.ModeKeys.EVAL,
                                  src1_vocab_table, src2_vocab_table,
                                  tgt_vocab_table)
    return EvalModel(graph=graph,
                     model=model,
                     iterator=iterator,
                     src1_file_placeholder=src1_file_placeholder,
                     src2_file_placeholder=src2_file_placeholder,
                     tgt_file_placeholder=tgt_file_placeholder)
def create_train_model(model_creator, hparams, scope=None, num_workers=1,
                       jobid=0, extra_args=None):
    """Create train graph, model, and iterator."""
    src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
    tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container(scope or "train"):
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
            src_vocab_file, tgt_vocab_file, hparams.share_vocab)
        src_dataset = tf.contrib.data.TextLineDataset(src_file)
        tgt_dataset = tf.contrib.data.TextLineDataset(tgt_file)
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        iterator = iterator_utils.get_iterator(
            src_dataset,
            tgt_dataset,
            src_vocab_table,
            tgt_vocab_table,
            batch_size=hparams.batch_size,
            sos=hparams.sos,
            eos=hparams.eos,
            source_reverse=hparams.source_reverse,
            random_seed=hparams.random_seed,
            num_buckets=hparams.num_buckets,
            src_max_len=hparams.src_max_len,
            tgt_max_len=hparams.tgt_max_len,
            skip_count=skip_count_placeholder,
            num_shards=num_workers,
            shard_index=jobid)
        # Note: One can set model_device_fn to
        # `tf.train.replica_device_setter(ps_tasks)` for distributed training.
        model_device_fn = None
        if extra_args:
            model_device_fn = extra_args.model_device_fn
        with tf.device(model_device_fn):
            model = model_creator(hparams,
                                  iterator=iterator,
                                  mode=tf.contrib.learn.ModeKeys.TRAIN,
                                  source_vocab_table=src_vocab_table,
                                  target_vocab_table=tgt_vocab_table,
                                  scope=scope,
                                  extra_args=extra_args)
    return TrainModel(graph=graph,
                      model=model,
                      iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)
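# Hedged driver sketch, assuming the TrainModel above: skip_count_placeholder
# lets a restarted job skip examples already consumed in a partial epoch.
# `model.train(sess)` follows the nmt-style per-step API and is an assumption
# here, as are `model_creator` and `hparams` coming from elsewhere; checkpoint
# save/restore is omitted.
train_model = create_train_model(model_creator, hparams)
with train_model.graph.as_default():
    # Initializer ops must live in the train graph.
    init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
with tf.Session(graph=train_model.graph) as sess:
    sess.run(init_ops)
    # Feed 0 for a fresh epoch, or a saved count to resume mid-epoch.
    sess.run(train_model.iterator.initializer,
             feed_dict={train_model.skip_count_placeholder: 0})
    while True:
        try:
            train_model.model.train(sess)  # assumed single-update train step
        except tf.errors.OutOfRangeError:
            break  # epoch finished; re-initialize the iterator to continue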
def create_eval_model(model_creator, hparams, scope=None, extra_args=None):
    """Create eval graph, model, src/tgt file holders, and iterator."""
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    lbl_vocab_file = hparams.lbl_vocab_file
    graph = tf.Graph()
    with graph.as_default(), tf.container(scope or "eval"):
        src_vocab_table, tgt_vocab_table, lbl_vocab_table = \
            vocab_utils.create_vocab_tables(src_vocab_file, tgt_vocab_file,
                                            lbl_vocab_file, hparams.share_vocab)
        reverse_tgt_vocab_table = lookup_ops.index_to_string_table_from_file(
            tgt_vocab_file, default_value=vocab_utils.UNK)
        reverse_lbl_vocab_table = lookup_ops.index_to_string_table_from_file(
            lbl_vocab_file, default_value=vocab_utils.UNK)
        src_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        lbl_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        src_dataset = tf.data.TextLineDataset(src_file_placeholder)
        tgt_dataset = tf.data.TextLineDataset(tgt_file_placeholder)
        lbl_dataset = tf.data.TextLineDataset(lbl_file_placeholder)
        iterator = iterator_utils.get_iterator(
            src_dataset,
            tgt_dataset,
            lbl_dataset,
            src_vocab_table,
            tgt_vocab_table,
            lbl_vocab_table,
            hparams.batch_size,
            sos=hparams.sos,
            eos=hparams.eos,
            random_seed=hparams.random_seed,
            num_buckets=hparams.num_buckets,
            src_max_len=hparams.src_max_len_infer,
            tgt_max_len=hparams.tgt_max_len_infer)
        model = model_creator(
            hparams,
            iterator=iterator,
            mode=tf.contrib.learn.ModeKeys.EVAL,
            source_vocab_table=src_vocab_table,
            target_vocab_table=tgt_vocab_table,
            label_vocab_table=lbl_vocab_table,
            reverse_target_vocab_table=reverse_tgt_vocab_table,
            reverse_target_intent_vocab_table=reverse_lbl_vocab_table,
            scope=scope,
            extra_args=extra_args)
    return EvalModel(graph=graph,
                     model=model,
                     src_file_placeholder=src_file_placeholder,
                     tgt_file_placeholder=tgt_file_placeholder,
                     lbl_file_placeholder=lbl_file_placeholder,
                     iterator=iterator)
def test_get_iterator(self):
    base_path = "../model_data/iwslt15/"
    src_dataset_path = os.path.join(base_path, "train.en")
    tgt_dataset_path = os.path.join(base_path, "train.vi")
    src_vocab_path = os.path.join(base_path, "vocab.en")
    tgt_vocab_path = os.path.join(base_path, "vocab.vi")
    train_iterator = get_iterator(src_dataset_path=src_dataset_path,
                                  tgt_dataset_path=tgt_dataset_path,
                                  src_vocab_path=src_vocab_path,
                                  tgt_vocab_path=tgt_vocab_path,
                                  batch_size=128,
                                  num_buckets=1,
                                  source_reverse=True,
                                  is_shuffle=False,
                                  src_max_len=50,
                                  tgt_max_len=50)
    with self.test_session() as session:
        session.run(tf.tables_initializer())
        session.run(tf.global_variables_initializer())
        session.run(train_iterator.initializer)
        # Build get_next() once outside the loop; calling it per iteration
        # would keep adding new ops to the graph.
        next_batch = train_iterator.get_next()
        with open("../test_result/src.txt", mode="a") as src_file, \
             open("../test_result/tgt_in.txt", mode="a") as tgt_in_file, \
             open("../test_result/tgt_out.txt", mode="a") as tgt_out_file, \
             open("../test_result/src_seq_len_file.txt", mode="a") as src_seq_len_file, \
             open("../test_result/tgt_seq_len_file.txt", mode="a") as tgt_seq_len_file:
            for _ in range(10000):
                try:
                    src, tgt_in, tgt_out, src_seq_len, tgt_seq_len = \
                        session.run(next_batch)
                    for (src_sen, tgt_in_sen, tgt_out_sen, src_seq_one_len,
                         tgt_seq_one_len) in zip(src, tgt_in, tgt_out,
                                                 src_seq_len, tgt_seq_len):
                        src_file.write(
                            " ".join(str(n) for n in src_sen) + "\n")
                        tgt_in_file.write(
                            " ".join(str(n) for n in tgt_in_sen) + "\n")
                        tgt_out_file.write(
                            " ".join(str(n) for n in tgt_out_sen) + "\n")
                        src_seq_len_file.write(str(src_seq_one_len) + "\n")
                        tgt_seq_len_file.write(str(tgt_seq_one_len) + "\n")
                except tf.errors.OutOfRangeError:
                    break
def create_train_model(model_creator, hparams, input_path, target_path, mode):
    """Create train graph, model, and iterator."""
    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        input_vocab_table = vocab_utils.create_vocab_table(hparams.vocab_path)
        input_dataset = tf.contrib.data.TextLineDataset(input_path)
        output_dataset = tf.contrib.data.TextLineDataset(target_path)
        iterator = iterator_utils.get_iterator(
            input_dataset,
            output_dataset,
            input_vocab_table,
            batch_size=hparams.batch_size,
            random_seed=hparams.random_seed,
            pad=hparams.pad,
            input_max_len=hparams.input_max_len)
        model = model_creator(hparams,
                              mode,
                              iterator,
                              input_vocab_table=input_vocab_table,
                              reverse_input_vocab_table=None)
    return TrainModel(graph, model, iterator)
def create_eval_model(model_creator, hparams, scope=None, single_cell_fn=None):
    """Create eval graph, model, src/tgt file holders, and iterator."""
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default():
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
            src_vocab_file, tgt_vocab_file, hparams.share_vocab)
        src_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        tgt_file_placeholder = tf.placeholder(shape=(), dtype=tf.string)
        src_dataset = tf.contrib.data.TextLineDataset(src_file_placeholder)
        tgt_dataset = tf.contrib.data.TextLineDataset(tgt_file_placeholder)
        iterator = iterator_utils.get_iterator(
            src_dataset,
            tgt_dataset,
            src_vocab_table,
            tgt_vocab_table,
            hparams.batch_size,
            sos=hparams.sos,
            eos=hparams.eos,
            source_reverse=hparams.source_reverse,
            random_seed=hparams.random_seed,
            num_buckets=hparams.num_buckets,
            src_max_len=hparams.src_max_len_infer,
            tgt_max_len=hparams.tgt_max_len_infer)
        model = model_creator(
            hparams,
            iterator=iterator,
            mode=tf.contrib.learn.ModeKeys.EVAL,
            source_vocab_table=src_vocab_table,
            target_vocab_table=tgt_vocab_table,
            scope=scope,
            single_cell_fn=single_cell_fn)
    return EvalModel(
        graph=graph,
        model=model,
        src_file_placeholder=src_file_placeholder,
        tgt_file_placeholder=tgt_file_placeholder,
        iterator=iterator)
def create_train_model(model_creator, hparams):
    """Create train graph, model, and iterator."""
    src1_file = "%s/%s" % (hparams.data_dir, hparams.train_word_data)
    src2_file = "%s/%s" % (hparams.data_dir, hparams.train_pos_data)
    tgt_file = "%s/%s" % (hparams.data_dir, hparams.train_role_data)
    src1_vocab_file = "%s/%s" % (hparams.data_dir, hparams.word_vocab)
    src2_vocab_file = "%s/%s" % (hparams.data_dir, hparams.pos_vocab)
    tgt_vocab_file = "%s/%s" % (hparams.data_dir, hparams.role_vocab)
    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        src1_vocab_table, src2_vocab_table, tgt_vocab_table = create_vocab_tables(
            src1_vocab_file, src2_vocab_file, tgt_vocab_file)
        src1_dataset = tf.contrib.data.TextLineDataset(src1_file)
        src2_dataset = tf.contrib.data.TextLineDataset(src2_file)
        tgt_dataset = tf.contrib.data.TextLineDataset(tgt_file)
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        iterator = iterator_utils.get_iterator(
            src1_dataset, src2_dataset, tgt_dataset,
            src1_vocab_table, src2_vocab_table, tgt_vocab_table,
            hparams.batch_size, hparams.num_buckets,
            hparams.src_max_len, hparams.tgt_max_len,
            skip_count=skip_count_placeholder)
        with tf.device("/cpu:0"):
            model = model_creator(hparams, iterator,
                                  tf.contrib.learn.ModeKeys.TRAIN,
                                  src1_vocab_table, src2_vocab_table,
                                  tgt_vocab_table)
    return TrainModel(graph=graph,
                      model=model,
                      iterator=iterator,
                      skip_count_placeholder=skip_count_placeholder)
def create_train_model(model_creator, hparams, input_path, target_path, mode):
    """Create train graph, model, and iterator."""
    graph = tf.Graph()
    with graph.as_default(), tf.container("train"):
        # Create a table that maps vocabulary words to vocab ids.
        input_vocab_table = vocab_utils.create_vocab_table(hparams.vocab_path)
        input_dataset = tf.data.TextLineDataset(input_path)
        output_dataset = tf.data.TextLineDataset(target_path)
        # Create an iterator over the training batches.
        iterator = iterator_utils.get_iterator(
            input_dataset,
            output_dataset,
            input_vocab_table,
            batch_size=hparams.batch_size,
            random_seed=hparams.random_seed,
            pad=hparams.pad,
            input_max_len=hparams.input_max_len)
        # Create the actual RNN model.
        model = model_creator(hparams,
                              mode,
                              iterator,
                              input_vocab_table=input_vocab_table)
    return TrainModel(graph, model, iterator)
def testGetIterator(self):
    tgt_vocab_table = src_vocab_table = lookup_ops.index_table_from_tensor(
        tf.constant(["a", "b", "c", "eos", "sos"]))
    src_dataset = tf.data.Dataset.from_tensor_slices(
        tf.constant(["f e a g", "c c a", "d", "c a"]))
    tgt_dataset = tf.data.Dataset.from_tensor_slices(
        tf.constant(["c c", "a b", "", "b c"]))
    hparams = tf.contrib.training.HParams(random_seed=3,
                                          num_buckets=5,
                                          source_reverse=False,
                                          eos="eos",
                                          sos="sos")
    batch_size = 2
    src_max_len = 3
    iterator = iterator_utils.get_iterator(
        src_dataset=src_dataset,
        tgt_dataset=tgt_dataset,
        src_vocab_table=src_vocab_table,
        tgt_vocab_table=tgt_vocab_table,
        batch_size=batch_size,
        sos=hparams.sos,
        eos=hparams.eos,
        source_reverse=hparams.source_reverse,
        random_seed=hparams.random_seed,
        num_buckets=hparams.num_buckets,
        src_max_len=src_max_len)
    table_initializer = tf.tables_initializer()
    source = iterator.source
    target_input = iterator.target_input
    target_output = iterator.target_output
    src_seq_len = iterator.source_sequence_length
    tgt_seq_len = iterator.target_sequence_length
    self.assertEqual([None, None], source.shape.as_list())
    self.assertEqual([None, None], target_input.shape.as_list())
    self.assertEqual([None, None], target_output.shape.as_list())
    self.assertEqual([None], src_seq_len.shape.as_list())
    self.assertEqual([None], tgt_seq_len.shape.as_list())
    with self.test_session() as sess:
        sess.run(table_initializer)
        sess.run(iterator.initializer)

        (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = (
            sess.run((source, src_seq_len, target_input, target_output,
                      tgt_seq_len)))
        self.assertAllEqual(
            [[-1, -1, 0],   # "f" == unknown, "e" == unknown, a
             [2, 0, 3]],    # c a eos -- eos is padding
            source_v)
        self.assertAllEqual([3, 2], src_len_v)
        self.assertAllEqual(
            [[4, 2, 2],     # sos c c
             [4, 1, 2]],    # sos b c
            target_input_v)
        self.assertAllEqual(
            [[2, 2, 3],     # c c eos
             [1, 2, 3]],    # b c eos
            target_output_v)
        self.assertAllEqual([3, 3], tgt_len_v)

        (source_v, src_len_v, target_input_v, target_output_v, tgt_len_v) = (
            sess.run((source, src_seq_len, target_input, target_output,
                      tgt_seq_len)))
        self.assertAllEqual(
            [[2, 2, 0]],    # c c a
            source_v)
        self.assertAllEqual([3], src_len_v)
        self.assertAllEqual(
            [[4, 0, 1]],    # sos a b
            target_input_v)
        self.assertAllEqual(
            [[0, 1, 3]],    # a b eos
            target_output_v)
        self.assertAllEqual([3], tgt_len_v)

        with self.assertRaisesOpError("End of sequence"):
            sess.run(source)
def create_train_model(model_creator,
                       hparams,
                       scope=None,
                       single_cell_fn=None,
                       model_device_fn=None):
    """Create train graph, model, and iterator."""
    src_file = "%s.%s" % (hparams.train_prefix, hparams.src)
    tgt_file = "%s.%s" % (hparams.train_prefix, hparams.tgt)
    src_vocab_file = hparams.src_vocab_file
    tgt_vocab_file = hparams.tgt_vocab_file
    graph = tf.Graph()
    with graph.as_default():
        src_vocab_table, tgt_vocab_table = vocab_utils.create_vocab_tables(
            src_vocab_file, tgt_vocab_file, hparams.share_vocab)
        src_dataset = tf.contrib.data.TextLineDataset(src_file)
        tgt_dataset = tf.contrib.data.TextLineDataset(tgt_file)
        skip_count_placeholder = tf.placeholder(shape=(), dtype=tf.int64)
        if hparams.curriculum == 'none':
            iterator = iterator_utils.get_iterator(
                src_dataset,
                tgt_dataset,
                src_vocab_table,
                tgt_vocab_table,
                batch_size=hparams.batch_size,
                sos=hparams.sos,
                eos=hparams.eos,
                source_reverse=hparams.source_reverse,
                random_seed=hparams.random_seed,
                num_buckets=hparams.num_buckets,
                src_max_len=hparams.src_max_len,
                tgt_max_len=hparams.tgt_max_len,
                skip_count=skip_count_placeholder)
        else:
            iterator = iterator_utils.get_feedable_iterator(
                hparams,
                src_dataset,
                tgt_dataset,
                src_vocab_table,
                tgt_vocab_table,
                batch_size=hparams.batch_size,
                sos=hparams.sos,
                eos=hparams.eos,
                source_reverse=hparams.source_reverse,
                random_seed=hparams.random_seed,
                num_buckets=hparams.num_buckets,
                src_max_len=hparams.src_max_len,
                tgt_max_len=hparams.tgt_max_len,
                skip_count=skip_count_placeholder)
        # Note: One can set model_device_fn to
        # `tf.train.replica_device_setter(ps_tasks)` for distributed training.
        with tf.device(model_device_fn):
            model = model_creator(
                hparams,
                iterator=iterator,
                mode=tf.contrib.learn.ModeKeys.TRAIN,
                source_vocab_table=src_vocab_table,
                target_vocab_table=tgt_vocab_table,
                scope=scope,
                single_cell_fn=single_cell_fn)
    return TrainModel(
        graph=graph,
        model=model,
        iterator=iterator,
        skip_count_placeholder=skip_count_placeholder)
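# Hedged sketch of the distributed-training note above, assuming a
# tf.train.ClusterSpec with parameter-server tasks is already configured and
# that `model_creator` and `hparams` come from elsewhere: passing a replica
# device setter as model_device_fn pins variables to the ps tasks while the
# rest of the graph stays on the worker.
device_fn = tf.train.replica_device_setter(ps_tasks=2)
train_model = create_train_model(model_creator, hparams,
                                 model_device_fn=device_fn)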