def test_with_counts(self):
    """Check the lookup tables built from a vocabulary file that has counts.

    A temporary vocabulary file with three tokens and their counts is
    created, the three lookup tables are built from it, and the test
    asserts that:

    * the reported vocabulary size is 6,
    * the three known tokens map to ids 0..2 and both unknown tokens to 3,
    * ids 0..3 map back to the three tokens followed by "UNK",
    * known tokens yield their counts and unknown tokens yield -1.
    """
    tokens = ["Hello", ".", "笑"]
    frequencies = [100, 200, 300]
    vocab_file = test_utils.create_temporary_vocab_file(tokens, frequencies)

    tables = vocab.create_vocabulary_lookup_table(vocab_file.name)
    vocab_to_id_table, id_to_vocab_table, word_to_count_table, vocab_size = tables
    self.assertEqual(vocab_size, 6)

    # Known tokens followed by two out-of-vocabulary strings.
    queries = ["Hello", ".", "笑", "??", "xxx"]

    with self.test_session() as sess:
        for init_op in (tf.global_variables_initializer(),
                        tf.local_variables_initializer(),
                        tf.tables_initializer()):
            sess.run(init_op)

        # token -> id
        id_values = sess.run(
            vocab_to_id_table.lookup(tf.convert_to_tensor(queries)))
        np.testing.assert_array_equal(id_values, [0, 1, 2, 3, 3])

        # id -> token
        id_tensor = tf.convert_to_tensor([0, 1, 2, 3], dtype=tf.int64)
        word_values = sess.run(id_to_vocab_table.lookup(id_tensor))
        decoded_words = np.char.decode(word_values.astype("S"), "utf-8")
        np.testing.assert_array_equal(
            decoded_words, ["Hello", ".", "笑", "UNK"])

        # token -> count
        count_values = sess.run(
            word_to_count_table.lookup(tf.convert_to_tensor(queries)))
        np.testing.assert_array_equal(
            count_values, [100, 200, 300, -1, -1])
def setUp(self):
    """Set the test dimensions and write a 100-entry vocabulary file.

    The vocabulary consists of the decimal strings "0".."99"; the vocab
    info object is obtained from the temporary file via
    ``inputs.get_vocab_info``.
    """
    super(EncoderDecoderTests, self).setUp()
    tf.logging.set_verbosity(tf.logging.INFO)

    self.batch_size = 4
    self.input_depth = 32
    self.max_decode_length = 40

    # Create vocabulary
    self.vocab_size = 100
    self.vocab_list = list(map(str, range(self.vocab_size)))
    vocab_file = test_utils.create_temporary_vocab_file(self.vocab_list)
    self.vocab_file = vocab_file
    self.vocab_info = inputs.get_vocab_info(vocab_file.name)
def setUp(self):
    """Set the test dimensions and write a mixed ASCII/Japanese vocabulary.

    The vocabulary is the digit strings "0".."9" followed by five extra
    tokens; it is written to a temporary file from which the vocab info
    is read via ``vocab.get_vocab_info``.
    """
    super(EncoderDecoderTests, self).setUp()
    tf.logging.set_verbosity(tf.logging.INFO)

    self.batch_size = 2
    self.input_depth = 4
    self.sequence_length = 10

    # Create vocabulary
    digit_tokens = list(map(str, range(10)))
    extra_tokens = ["笑う", "泣く", "了解", "はい", "^_^"]
    self.vocab_list = digit_tokens + extra_tokens
    self.vocab_size = len(self.vocab_list)
    vocab_file = test_utils.create_temporary_vocab_file(self.vocab_list)
    self.vocab_file = vocab_file
    self.vocab_info = vocab.get_vocab_info(vocab_file.name)
def setUp(self):
    """Set the test dimensions, write the vocabulary and get a global step.

    The vocabulary is the digit strings "0".."9" followed by five extra
    tokens; it is written to a temporary file from which the vocab info
    is read. Finally ``tf.contrib.framework.get_or_create_global_step()``
    is called.
    """
    super(EncoderDecoderTests, self).setUp()
    tf.logging.set_verbosity(tf.logging.INFO)

    self.batch_size = 2
    self.input_depth = 4
    self.sequence_length = 10

    # Create vocabulary
    digit_tokens = list(map(str, range(10)))
    extra_tokens = ["笑う", "泣く", "了解", "はい", "^_^"]
    self.vocab_list = digit_tokens + extra_tokens
    self.vocab_size = len(self.vocab_list)
    vocab_file = test_utils.create_temporary_vocab_file(self.vocab_list)
    self.vocab_file = vocab_file
    self.vocab_info = vocab.get_vocab_info(vocab_file.name)

    tf.contrib.framework.get_or_create_global_step()
def test_train_infer(self):
    """Run the training and inference scripts end to end on toy data.

    Steps, in order:

    1. Create temporary parallel train/dev text files and source/target
       vocabulary files through ``test_utils``.
    2. Load ``train.py`` from ``BIN_FOLDER``, set its flags (some directly,
       some through a YAML config file written into ``self.output_dir``),
       call ``main([])`` and assert that the step-50 checkpoint data file
       exists.
    3. Load ``infer.py``, run it with the ``DecodeText`` and
       ``DumpAttention`` tasks, and assert that the attention dump and one
       PNG exist and that the dumped arrays have the expected second
       dimension.
    4. Load ``infer.py`` again, run it with a beam width of 5 and the
       ``DumpBeams`` task, and assert that ``beams.npz`` was written.
    """
    # Create dummy data
    sources_train, targets_train = test_utils.create_temp_parallel_data(
        sources=["a a a a", "b b b b", "c c c c", "笑 笑 笑 笑"],
        targets=["b b b b", "a a a a", "c c c c", "泣 泣 泣 泣"])
    sources_dev, targets_dev = test_utils.create_temp_parallel_data(
        sources=["a a", "b b", "c c c", "笑 笑 笑"],
        targets=["b b", "a a", "c c c", "泣 泣 泣"])
    vocab_source = test_utils.create_temporary_vocab_file(["a", "b", "c", "笑"])
    vocab_target = test_utils.create_temporary_vocab_file(["a", "b", "c", "泣"])

    # Start from cleared flags and a fresh default graph before loading the
    # training script as a module.
    _clear_flags()
    tf.reset_default_graph()
    train_script = imp.load_source("seq2seq.test.train_bin",
                                   os.path.join(BIN_FOLDER, "train.py"))

    # Set training flags
    # NOTE(review): the triple-quoted YAML literals in this test appear
    # whitespace-collapsed in the reviewed view; their contents are left
    # exactly as seen -- confirm the line breaks/indentation against the
    # canonical file.
    tf.app.flags.FLAGS.output_dir = self.output_dir
    tf.app.flags.FLAGS.hooks = """ - class: PrintModelAnalysisHook - class: MetadataCaptureHook - class: TrainSampleHook """
    tf.app.flags.FLAGS.metrics = """ - class: LogPerplexityMetricSpec - class: BleuMetricSpec - class: RougeMetricSpec params: rouge_type: rouge_1/f_score """
    tf.app.flags.FLAGS.model = "AttentionSeq2Seq"
    # The two "{}" placeholders are filled with the vocabulary file paths.
    tf.app.flags.FLAGS.model_params = """ attention.params: num_units: 10 vocab_source: {} vocab_target: {} """.format(vocab_source.name, vocab_target.name)
    tf.app.flags.FLAGS.batch_size = 2

    # We pass a few flags via a config file
    config_path = os.path.join(self.output_dir, "train_config.yml")
    with gfile.GFile(config_path, "w") as config_file:
        yaml.dump({
            "input_pipeline_train": {
                "class": "ParallelTextInputPipeline",
                "params": {
                    "source_files": [sources_train.name],
                    "target_files": [targets_train.name],
                }
            },
            "input_pipeline_dev": {
                "class": "ParallelTextInputPipeline",
                "params": {
                    "source_files": [sources_dev.name],
                    "target_files": [targets_dev.name],
                }
            },
            "train_steps": 50,
            "model_params": {
                "embedding.dim": 10,
                "decoder.params": {
                    "rnn_cell": {
                        "cell_class": "GRUCell",
                        "cell_params": {
                            "num_units": 8
                        }
                    }
                },
                "encoder.params": {
                    "rnn_cell": {
                        "cell_class": "GRUCell",
                        "cell_params": {
                            "num_units": 8
                        }
                    }
                }
            }
        }, config_file)

    tf.app.flags.FLAGS.config_paths = config_path

    # Run training
    tf.logging.set_verbosity(tf.logging.INFO)
    train_script.main([])

    # Make sure a checkpoint was written ("50" matches "train_steps" above)
    expected_checkpoint = os.path.join(self.output_dir,
                                       "model.ckpt-50.data-00000-of-00001")
    self.assertTrue(os.path.exists(expected_checkpoint))

    # Reset flags and import inference script
    _clear_flags()
    tf.reset_default_graph()
    infer_script = imp.load_source("seq2seq.test.infer_bin",
                                   os.path.join(BIN_FOLDER, "infer.py"))

    # Set inference flags; inference reads the dev files created above.
    attention_dir = os.path.join(self.output_dir, "att")
    tf.app.flags.FLAGS.model_dir = self.output_dir
    tf.app.flags.FLAGS.input_pipeline = """ class: ParallelTextInputPipeline params: source_files: - {} target_files: - {} """.format(sources_dev.name, targets_dev.name)
    tf.app.flags.FLAGS.batch_size = 2
    tf.app.flags.FLAGS.checkpoint_path = os.path.join(self.output_dir,
                                                      "model.ckpt-50")

    # Use DecodeText Task (plus DumpAttention writing into attention_dir)
    tf.app.flags.FLAGS.tasks = """ - class: DecodeText - class: DumpAttention params: output_dir: {} """.format(attention_dir)

    # Make sure inference runs successfully
    infer_script.main([])

    # Make sure attention scores and visualizations exist
    self.assertTrue(
        os.path.exists(os.path.join(attention_dir, "attention_scores.npz")))
    self.assertTrue(os.path.exists(os.path.join(attention_dir, "00002.png")))

    # Load attention scores and assert shape
    # NOTE(review): the object returned by np.load is never closed here;
    # consider a `with` block.
    # The expected second dimensions (3, 3, 4, 4) are presumably the dev
    # source lengths (2, 2, 3, 3 tokens) plus one end token -- TODO confirm.
    scores = np.load(os.path.join(attention_dir, "attention_scores.npz"))
    self.assertIn("arr_0", scores)
    self.assertEqual(scores["arr_0"].shape[1], 3)
    self.assertIn("arr_1", scores)
    self.assertEqual(scores["arr_1"].shape[1], 3)
    self.assertIn("arr_2", scores)
    self.assertEqual(scores["arr_2"].shape[1], 4)
    self.assertIn("arr_3", scores)
    self.assertEqual(scores["arr_3"].shape[1], 4)

    # Test inference with beam search
    _clear_flags()
    tf.reset_default_graph()
    infer_script = imp.load_source("seq2seq.test.infer_bin",
                                   os.path.join(BIN_FOLDER, "infer.py"))

    # Set inference flags
    tf.app.flags.FLAGS.model_dir = self.output_dir
    tf.app.flags.FLAGS.input_pipeline = """ class: ParallelTextInputPipeline params: source_files: - {} target_files: - {} """.format(sources_dev.name, targets_dev.name)
    tf.app.flags.FLAGS.batch_size = 2
    tf.app.flags.FLAGS.checkpoint_path = os.path.join(self.output_dir,
                                                      "model.ckpt-50")
    tf.app.flags.FLAGS.model_params = """ inference.beam_search.beam_width: 5 """
    # The "{}" placeholder is filled with the path of the beams dump file.
    tf.app.flags.FLAGS.tasks = """ - class: DecodeText params: postproc_fn: seq2seq.data.postproc.decode_sentencepiece - class: DumpBeams params: file: {} """.format(os.path.join(self.output_dir, "beams.npz"))

    # Run inference w/ beam search
    infer_script.main([])
    self.assertTrue(os.path.exists(os.path.join(self.output_dir, "beams.npz")))
def setUp(self):
    """Write the three-token vocabulary fixture to a temporary file."""
    super(CreateVocabularyLookupTableTest, self).setUp()
    tf.logging.set_verbosity(tf.logging.INFO)

    tokens = ["Hello", ".", "笑"]
    self.vocab_list = tokens
    self.vocab_file = test_utils.create_temporary_vocab_file(tokens)
def setUp(self):
    """Write the three-token vocabulary fixture to a temporary file."""
    super(VocabInfoTest, self).setUp()
    tf.logging.set_verbosity(tf.logging.INFO)

    tokens = ["Hello", ".", "Bye"]
    self.vocab_list = tokens
    self.vocab_file = test_utils.create_temporary_vocab_file(tokens)
def test_train_infer(self):
    """Run the training and inference scripts end to end on toy data.

    Steps, in order:

    1. Create temporary parallel train/dev text files and source/target
       vocabulary files through ``test_utils``.
    2. Load ``train.py`` from ``BIN_FOLDER``, set its flags (some directly,
       some through a YAML config file written into ``self.output_dir``),
       call ``main([])`` and assert that the step-50 checkpoint data file
       exists.
    3. Load ``infer.py``, run it with the ``DecodeText`` and
       ``DumpAttention`` tasks, and assert that the attention dump and one
       PNG exist and that the dumped arrays have the expected second
       dimension.
    4. Load ``infer.py`` again, run it with a beam width of 5 and the
       ``DumpBeams`` task, and assert that ``beams.npz`` was written.
    """
    # Create dummy data
    sources_train, targets_train = test_utils.create_temp_parallel_data(
        sources=["a a a a", "b b b b", "c c c c", "笑 笑 笑 笑"],
        targets=["b b b b", "a a a a", "c c c c", "泣 泣 泣 泣"])
    sources_dev, targets_dev = test_utils.create_temp_parallel_data(
        sources=["a a", "b b", "c c c", "笑 笑 笑"],
        targets=["b b", "a a", "c c c", "泣 泣 泣"])
    vocab_source = test_utils.create_temporary_vocab_file(["a", "b", "c", "笑"])
    vocab_target = test_utils.create_temporary_vocab_file(["a", "b", "c", "泣"])

    # Start from cleared flags and a fresh default graph before loading the
    # training script as a module.
    _clear_flags()
    tf.reset_default_graph()
    train_script = imp.load_source("seq2seq.test.train_bin",
                                   os.path.join(BIN_FOLDER, "train.py"))

    # Set training flags
    # NOTE(review): the triple-quoted YAML literals in this test appear
    # whitespace-collapsed in the reviewed view; their contents are left
    # exactly as seen -- confirm the line breaks/indentation against the
    # canonical file.
    tf.app.flags.FLAGS.output_dir = self.output_dir
    tf.app.flags.FLAGS.hooks = """ - class: PrintModelAnalysisHook - class: MetadataCaptureHook - class: TrainSampleHook """
    tf.app.flags.FLAGS.metrics = """ - class: LogPerplexityMetricSpec - class: BleuMetricSpec - class: RougeMetricSpec params: rouge_type: rouge_1/f_score """
    tf.app.flags.FLAGS.model = "AttentionSeq2Seq"
    # The two "{}" placeholders are filled with the vocabulary file paths.
    tf.app.flags.FLAGS.model_params = """ attention.params: num_units: 10 vocab_source: {} vocab_target: {} """.format(vocab_source.name, vocab_target.name)
    tf.app.flags.FLAGS.batch_size = 2

    # We pass a few flags via a config file
    config_path = os.path.join(self.output_dir, "train_config.yml")
    with gfile.GFile(config_path, "w") as config_file:
        yaml.dump({
            "input_pipeline_train": {
                "class": "ParallelTextInputPipeline",
                "params": {
                    "source_files": [sources_train.name],
                    "target_files": [targets_train.name],
                }
            },
            "input_pipeline_dev": {
                "class": "ParallelTextInputPipeline",
                "params": {
                    "source_files": [sources_dev.name],
                    "target_files": [targets_dev.name],
                }
            },
            "train_steps": 50,
            "model_params": {
                "embedding.dim": 10,
                "decoder.params": {
                    "rnn_cell": {
                        "cell_class": "GRUCell",
                        "cell_params": {
                            "num_units": 8
                        }
                    }
                },
                "encoder.params": {
                    "rnn_cell": {
                        "cell_class": "GRUCell",
                        "cell_params": {
                            "num_units": 8
                        }
                    }
                }
            }
        }, config_file)

    tf.app.flags.FLAGS.config_paths = config_path

    # Run training
    tf.logging.set_verbosity(tf.logging.INFO)
    train_script.main([])

    # Make sure a checkpoint was written ("50" matches "train_steps" above)
    expected_checkpoint = os.path.join(self.output_dir,
                                       "model.ckpt-50.data-00000-of-00001")
    self.assertTrue(os.path.exists(expected_checkpoint))

    # Reset flags and import inference script
    _clear_flags()
    tf.reset_default_graph()
    infer_script = imp.load_source("seq2seq.test.infer_bin",
                                   os.path.join(BIN_FOLDER, "infer.py"))

    # Set inference flags; inference reads the dev files created above.
    attention_dir = os.path.join(self.output_dir, "att")
    tf.app.flags.FLAGS.model_dir = self.output_dir
    tf.app.flags.FLAGS.input_pipeline = """ class: ParallelTextInputPipeline params: source_files: - {} target_files: - {} """.format(sources_dev.name, targets_dev.name)
    tf.app.flags.FLAGS.batch_size = 2
    tf.app.flags.FLAGS.checkpoint_path = os.path.join(self.output_dir,
                                                      "model.ckpt-50")

    # Use DecodeText Task (plus DumpAttention writing into attention_dir)
    tf.app.flags.FLAGS.tasks = """ - class: DecodeText - class: DumpAttention params: output_dir: {} """.format(attention_dir)

    # Make sure inference runs successfully
    infer_script.main([])

    # Make sure attention scores and visualizations exist
    self.assertTrue(
        os.path.exists(os.path.join(attention_dir, "attention_scores.npz")))
    self.assertTrue(os.path.exists(os.path.join(attention_dir, "00002.png")))

    # Load attention scores and assert shape
    # NOTE(review): the object returned by np.load is never closed here;
    # consider a `with` block.
    # The expected second dimensions (3, 3, 4, 4) are presumably the dev
    # source lengths (2, 2, 3, 3 tokens) plus one end token -- TODO confirm.
    scores = np.load(os.path.join(attention_dir, "attention_scores.npz"))
    self.assertIn("arr_0", scores)
    self.assertEqual(scores["arr_0"].shape[1], 3)
    self.assertIn("arr_1", scores)
    self.assertEqual(scores["arr_1"].shape[1], 3)
    self.assertIn("arr_2", scores)
    self.assertEqual(scores["arr_2"].shape[1], 4)
    self.assertIn("arr_3", scores)
    self.assertEqual(scores["arr_3"].shape[1], 4)

    # Test inference with beam search
    _clear_flags()
    tf.reset_default_graph()
    infer_script = imp.load_source("seq2seq.test.infer_bin",
                                   os.path.join(BIN_FOLDER, "infer.py"))

    # Set inference flags
    tf.app.flags.FLAGS.model_dir = self.output_dir
    tf.app.flags.FLAGS.input_pipeline = """ class: ParallelTextInputPipeline params: source_files: - {} target_files: - {} """.format(sources_dev.name, targets_dev.name)
    tf.app.flags.FLAGS.batch_size = 2
    tf.app.flags.FLAGS.checkpoint_path = os.path.join(self.output_dir,
                                                      "model.ckpt-50")
    tf.app.flags.FLAGS.model_params = """ inference.beam_search.beam_width: 5 """
    # The "{}" placeholder is filled with the path of the beams dump file.
    tf.app.flags.FLAGS.tasks = """ - class: DecodeText params: postproc_fn: seq2seq.data.postproc.decode_sentencepiece - class: DumpBeams params: file: {} """.format(os.path.join(self.output_dir, "beams.npz"))

    # Run inference w/ beam search
    infer_script.main([])
    self.assertTrue(os.path.exists(os.path.join(self.output_dir, "beams.npz")))
def test_copy_gen_model(source_path=None, target_path=None, vocab_path=None):
    """Build a ``CopyGenSeq2Seq`` training graph and run its fetches once.

    Args:
        source_path: Path of the source text file fed to the input
            pipeline. When both ``source_path`` and ``target_path`` are
            None, a temporary parallel data set holding one randomly
            sampled sentence pair is created and used instead.
        target_path: Path of the target text file; see ``source_path``.
        vocab_path: Path of the vocabulary file used for both source and
            target. When None, a temporary vocabulary file is created.

    Returns:
        A ``(model, fetches_)`` tuple: the model instance and the values
        returned by ``sess.run`` for the model's non-None outputs.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    batch_size = 2
    sequence_length = 10

    if vocab_path is None:
        # Create vocabulary
        vocab_list = [str(i) for i in range(10)]
        vocab_list += ["笑う", "泣く", "了解", "はい", "^_^"]
        vocab_file = test_utils.create_temporary_vocab_file(vocab_list)
        # Result is unused; the call is retained in case it is relied on to
        # fail early for an unreadable vocabulary file -- TODO confirm.
        vocab.get_vocab_info(vocab_file.name)
        vocab_path = vocab_file.name
        tf.logging.info(vocab_file.name)
    else:
        # Same as above: return value intentionally not bound.
        vocab.get_vocab_info(vocab_path)
        vocab_list = get_vocab_list(vocab_path)

    # Tokens that are not in the vocabulary are mixed into the sampled text.
    extend_vocab = vocab_list + ["中国", "爱", "你"]

    tf.contrib.framework.get_or_create_global_step()

    source_len = sequence_length + 5
    target_len = sequence_length + 10
    # Sampled unconditionally (even when paths are supplied) so the NumPy
    # random state advances exactly as before.
    source = " ".join(np.random.choice(extend_vocab, source_len))
    target = " ".join(np.random.choice(extend_vocab, target_len))

    # Temporary data files created here (if any); closed in `finally` below.
    sources_file = None
    targets_file = None
    if source_path is None and target_path is None:
        sources_file, targets_file = test_utils.create_temp_parallel_data(
            sources=[source], targets=[target])
        source_path = sources_file.name
        target_path = targets_file.name

    try:
        # Build model graph
        mode = tf.contrib.learn.ModeKeys.TRAIN
        params_ = CopyGenSeq2Seq.default_params().copy()
        params_.update({
            "vocab_source": vocab_path,
            "vocab_target": vocab_path,
        })
        model = CopyGenSeq2Seq(params=params_, mode=mode)

        tf.logging.info(source_path)
        tf.logging.info(target_path)

        input_pipeline_ = input_pipeline.ParallelTextInputPipeline(
            params={
                "source_files": [source_path],
                "target_files": [target_path],
            },
            mode=mode)
        input_fn = training_utils.create_input_fn(
            pipeline=input_pipeline_, batch_size=batch_size)
        features, labels = input_fn()

        # Only non-None model outputs are passed to sess.run.
        fetches = model(features, labels, None)
        fetches = [fetch for fetch in fetches if fetch is not None]

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            sess.run(tf.tables_initializer())
            with tf.contrib.slim.queues.QueueRunners(sess):
                fetches_ = sess.run(fetches)
    finally:
        # Close the temporary data files even when graph construction or
        # the session run raises; previously they leaked on failure.
        for tmp_file in (sources_file, targets_file):
            if tmp_file is not None:
                tmp_file.close()

    return model, fetches_
def test_train_infer(self):
    """Run the training and inference scripts end to end on toy data.

    Steps, in order:

    1. Create temporary parallel train/dev text files and source/target
       vocabulary files through ``test_utils``.
    2. Load ``train.py`` from ``BIN_FOLDER``, set its flags (some directly,
       some through a YAML config file written into ``self.output_dir``),
       call ``main([])`` and assert that the step-50 checkpoint data file
       exists.
    3. Load ``infer.py``, run it with ``dump_attention_dir`` set, and
       assert that the attention dump and one PNG exist and that the
       dumped arrays have the expected second dimension.
    """
    # Create dummy data
    sources_train, targets_train = test_utils.create_temp_parallel_data(
        sources=["a a a a", "b b b b", "c c c c", "笑 笑 笑 笑"],
        targets=["b b b b", "a a a a", "c c c c", "泣 泣 泣 泣"])
    sources_dev, targets_dev = test_utils.create_temp_parallel_data(
        sources=["a a", "b b", "c c c", "笑 笑 笑"],
        targets=["b b", "a a", "c c c", "泣 泣 泣"])
    vocab_source = test_utils.create_temporary_vocab_file(
        ["a", "b", "c", "笑"])
    vocab_target = test_utils.create_temporary_vocab_file(
        ["a", "b", "c", "泣"])

    # Start from cleared flags and a fresh default graph before loading the
    # training script as a module.
    _clear_flags()
    tf.reset_default_graph()
    train_script = imp.load_source("seq2seq.test.train_bin",
                                   os.path.join(BIN_FOLDER, "train.py"))

    # Set training flags
    tf.app.flags.FLAGS.output_dir = self.output_dir
    tf.app.flags.FLAGS.train_source = sources_train.name
    tf.app.flags.FLAGS.train_target = targets_train.name
    tf.app.flags.FLAGS.vocab_source = vocab_source.name
    tf.app.flags.FLAGS.vocab_target = vocab_target.name
    tf.app.flags.FLAGS.model = "AttentionSeq2Seq"
    tf.app.flags.FLAGS.batch_size = 2

    # We pass a few flags via a config file
    config_path = os.path.join(self.output_dir, "train_config.yml")
    with gfile.GFile(config_path, "w") as config_file:
        yaml.dump(
            {
                "dev_source": sources_dev.name,
                "dev_target": targets_dev.name,
                "train_steps": 50,
                "hparams": {
                    "embedding.dim": 64,
                    "attention.dim": 16,
                    "decoder.rnn_cell.cell_spec": {
                        "class": "GRUCell",
                        "num_units": 32
                    }
                }
            }, config_file)

    tf.app.flags.FLAGS.config_path = config_path

    # Run training
    tf.logging.set_verbosity(tf.logging.INFO)
    train_script.main([])

    # Make sure a checkpoint was written ("50" matches "train_steps" above)
    expected_checkpoint = os.path.join(
        self.output_dir, "model.ckpt-50.data-00000-of-00001")
    self.assertTrue(os.path.exists(expected_checkpoint))

    # Reset flags and import inference script
    _clear_flags()
    tf.reset_default_graph()
    infer_script = imp.load_source("seq2seq.test.infer_bin",
                                   os.path.join(BIN_FOLDER, "infer.py"))

    # Set inference flags
    attention_dir = os.path.join(self.output_dir, "att")
    tf.app.flags.FLAGS.model_dir = self.output_dir
    tf.app.flags.FLAGS.source = sources_dev.name
    tf.app.flags.FLAGS.batch_size = 2
    tf.app.flags.FLAGS.checkpoint_path = os.path.join(
        self.output_dir, "model.ckpt-50")
    tf.app.flags.FLAGS.dump_attention_dir = attention_dir

    # Make sure inference runs successfully
    infer_script.main([])

    # Make sure attention scores and visualizations exist
    scores_path = os.path.join(attention_dir, "attention_scores.npz")
    self.assertTrue(os.path.exists(scores_path))
    self.assertTrue(
        os.path.exists(os.path.join(attention_dir, "00002.png")))

    # Load attention scores and assert shape. The archive is opened as a
    # context manager so its file handle is closed even when an assertion
    # fails. The expected second dimensions are presumably the dev source
    # lengths (2, 2, 3, 3 tokens) plus one end token -- TODO confirm.
    expected_widths = (("arr_0", 3), ("arr_1", 3), ("arr_2", 4), ("arr_3", 4))
    with np.load(scores_path) as scores:
        for array_name, width in expected_widths:
            self.assertIn(array_name, scores)
            self.assertEqual(scores[array_name].shape[1], width)
def setUp(self):
    """Write the three-token vocabulary fixture to a temporary file."""
    super(VocabInfoTest, self).setUp()
    tf.logging.set_verbosity(tf.logging.INFO)

    tokens = ["Hello", ".", "Bye"]
    self.vocab_list = tokens
    self.vocab_file = test_utils.create_temporary_vocab_file(tokens)