def add_extra_summary_avg_bleu(hypo_tokens, hypo_len, ref_words, collections=None):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]

    ref_tokens = vocab_i2s.lookup(ref_words)
    ref_len = length_pre_embedding(ref_words)

    avg_bleu = get_avg_bleu(ref_tokens, hypo_tokens, ref_len, hypo_len)
    tf.summary.scalar('bleu', avg_bleu, collections)

    return avg_bleu
def input_fn(file_path, vocab_table, batch_size, num_epochs=None, num_examples=None,
             seed=0, noiser=None, use_free_set=False, shuffle_input=True):
    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]
    pad_token = tf.constant(bytes(PAD_TOKEN, encoding='utf8'), dtype=tf.string)
    pad_id = vocab_table.lookup(pad_token)

    base_dataset = read_examples_from_file(
        file_path, num_examples, seed, noiser,
        util.get_free_words_set() if use_free_set else None)

    # Build one dataset per column of the examples, pad-batch each, then zip them.
    dataset_splits = []
    for index in range(len(base_dataset[0])):
        split_dtype = infer_dtype(base_dataset[0][index])

        split = tf.data.Dataset.from_generator(
            generator=get_generator(base_dataset, index),
            output_types=split_dtype,
            output_shapes=(None,))

        # String columns are padded with the literal pad token,
        # integer columns with its vocabulary id.
        if split_dtype == tf.string:
            pad = pad_token
        else:
            pad = pad_id

        split = split.padded_batch(batch_size,
                                   padded_shapes=[None],
                                   padding_values=pad)
        dataset_splits.append(split)

    dataset = tf.data.Dataset.zip(tuple(dataset_splits))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    # tf.estimator expects (features, labels); attach a constant dummy label.
    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()
    dataset = tf.data.Dataset.zip((dataset, fake_label)).prefetch(1)

    return dataset
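def _peek_batch(dataset):
    # Hedged sanity-check sketch, not part of the original source: pulls one batch
    # out of the dataset built by input_fn above using the TF 1.x graph/session API.
    # Each element is ((split_1, ..., split_k), 0): the tuple of padded feature
    # tensors plus the dummy label that tf.estimator requires.
    iterator = dataset.make_initializable_iterator()
    features, label = iterator.get_next()
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())  # the vocab lookup tables need init
        sess.run(iterator.initializer)
        return sess.run(features)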
def words2ids(words, oov=None):
    # Map words to vocabulary ids; words found only in `oov` get ids beyond the
    # fixed vocabulary, all other unknown words map to UNKNOWN_TOKEN.
    if oov is None:
        oov = []

    word2id = vocab.get_vocab_lookup_tables()[vocab.RAW_WORD2ID]

    ids = []
    for w in words:
        if w not in word2id:
            if w in oov:
                ids.append(len(word2id) + oov.index(w))
            else:
                ids.append(word2id[vocab.UNKNOWN_TOKEN])
        else:
            ids.append(word2id[w])

    return ids
def input_fn(file_path, vocab_table, config, batch_size, num_epochs=None, num_examples=None,
             seed=0, noiser=None, use_free_set=False, shuffle_input=True):
    gen = read_examples_from_file(
        file_path, config, num_examples, seed, noiser,
        util.get_free_words_set() if use_free_set else None,
        return_gen=True)

    # Per-example probabilities are stored in a side file next to the data file.
    probs = util.load_str_list(str(file_path) + '_probs')
    probs = [float(p) for p in probs]
    dataset_probs = tf.data.Dataset.from_tensor_slices(
        np.array(probs, dtype=np.float32).reshape((-1, 1)))
    dataset_probs = dataset_probs.batch(batch_size)

    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]
    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    # Each example is a tuple of six variable-length string-token sequences.
    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=(tf.string,) * 6,
        output_shapes=(tf.TensorShape([None]),) * 6)

    dataset = dataset.map(lambda *x: tuple(vocab_table.lookup(i) for i in x))
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]),) * 6,
        padding_values=(pad_id,) * 6)

    dataset = tf.data.Dataset.zip((dataset, dataset_probs))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    # tf.estimator expects (features, labels); attach a constant dummy label.
    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()
    dataset = tf.data.Dataset.zip((dataset, fake_label))
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
def create_oov(words):
    # Build the per-example OOV list. Returns:
    #   ids          - ids in the fixed vocabulary (OOV words map to UNKNOWN_TOKEN),
    #   ids_extended - ids in the extended vocabulary (OOV words get ids past it),
    #   oov          - the OOV words, in order of first occurrence.
    word2id = vocab.get_vocab_lookup_tables()[vocab.RAW_WORD2ID]

    ids = []
    ids_extended = []
    oov = []
    for w in words:
        if w not in word2id:
            if w not in oov:
                oov.append(w)
            ids_extended.append(len(word2id) + oov.index(w))
            ids.append(word2id[vocab.UNKNOWN_TOKEN])
        else:
            ids.append(word2id[w])
            ids_extended.append(word2id[w])

    return ids, ids_extended, oov
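# Illustrative worked example (an assumption, not from the source): with a toy
# RAW_WORD2ID table {'<unk>': 0, 'the': 1, 'cat': 2} where '<unk>' is UNKNOWN_TOKEN,
#
#   create_oov(['the', 'zebra', 'cat', 'zebra'])
#     -> ids          = [1, 0, 2, 0]   # OOV words collapse to <unk>
#        ids_extended = [1, 3, 2, 3]   # OOV words get ids past the vocabulary
#        oov          = ['zebra']
#
#   words2ids(['a', 'zebra'], oov=['zebra'])
#     -> [0, 3]                        # 'a' has no OOV slot, so it maps to <unk>
#
# The token strings and table layout above are invented purely for illustration.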
def add_extra_summary_trace(pred_tokens, pred_len, base_words, output_words, src_words,
                            tgt_words, inserted_words, deleted_words, collections=None):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]

    tgt_tokens = vocab_i2s.lookup(tgt_words)
    tgt_len = length_pre_embedding(tgt_words)

    trace_summary = get_trace(pred_tokens, tgt_tokens, src_words, inserted_words,
                              deleted_words, pred_len, tgt_len)
    tf.summary.text('trace', trace_summary, collections)

    return trace_summary
def get_trace(pred_tokens, tgt_tokens, src_words, inserted_words, deleted_words,
              pred_len, tgt_len):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]

    # Beam-search output carries an extra beam dimension; join each beam separately.
    if pred_tokens.shape.ndims > 2:
        pred_joined = metrics.join_beams(pred_tokens, pred_len)
    else:
        pred_joined = metrics.join_tokens(pred_tokens, pred_len)

    tgt_joined = metrics.join_tokens(tgt_tokens, tgt_len)
    src_joined = metrics.join_tokens(vocab_i2s.lookup(src_words),
                                     length_pre_embedding(src_words))
    iw_joined = metrics.join_tokens(vocab_i2s.lookup(inserted_words),
                                    length_pre_embedding(inserted_words), ', ')
    dw_joined = metrics.join_tokens(vocab_i2s.lookup(deleted_words),
                                    length_pre_embedding(deleted_words), ', ')

    return tf.concat([src_joined, iw_joined, dw_joined, tgt_joined, pred_joined], axis=1)
def input_fn_from_gen_multi(gen, vocab_table, batch_size, shuffle_input=False,
                            num_epochs=None, prefetch=False):
    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]
    base_dataset = list(gen())
    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    dataset_splits = []
    for index in range(len(base_dataset[0])):
        split = tf.data.Dataset.from_generator(
            generator=get_generator(base_dataset, index),
            output_types=tf.string,
            output_shapes=(None,))
        split = split.map(lambda x: vocab_table.lookup(x))
        split = split.padded_batch(batch_size,
                                   padded_shapes=[None],
                                   padding_values=pad_id)
        dataset_splits.append(split)

    dataset = tf.data.Dataset.zip(tuple(dataset_splits))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()
    dataset = tf.data.Dataset.zip((dataset, fake_label))

    if prefetch:
        dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
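def _predict_from_memory(estimator, vocab_table, examples, batch_size=32):
    # Hedged usage sketch (an assumption, not from the original source): feeds
    # in-memory, pre-tokenized examples through input_fn_from_gen_multi and into an
    # Estimator's predict loop. `examples` is assumed to be an iterable of tuples of
    # token lists, one tuple per example, matching whatever the model_fn expects.
    def gen():
        for example in examples:
            yield example

    return list(estimator.predict(
        input_fn=lambda: input_fn_from_gen_multi(gen, vocab_table, batch_size)))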
def get_estimator(config, embed_matrix, my_model_fn=model_fn):
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        tf_random_seed=config.seed,
        save_checkpoints_steps=config.eval.save_steps,
        save_summary_steps=config.eval.save_summary_steps,
        keep_checkpoint_max=config.eval.keep_checkpoint_max,
        log_step_count_steps=10)

    estimator = tf.estimator.Estimator(
        model_fn=lambda features, labels, mode, params: my_model_fn(
            features, mode, params, embed_matrix, vocab.get_vocab_lookup_tables()),
        model_dir=config.model_dir,
        config=run_config,
        params=config)

    return estimator
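def _train_and_evaluate(config, embed_matrix, vocab_table):
    # Hedged usage sketch, not part of the original source: wires get_estimator
    # together with the dataset-building input_fn defined above. The config
    # attribute names used here (dataset.train_path, dataset.valid_path, optim.*)
    # are assumptions made only for illustration.
    estimator = get_estimator(config, embed_matrix)
    estimator.train(
        input_fn=lambda: input_fn(config.dataset.train_path, vocab_table, config,
                                  config.optim.batch_size,
                                  num_epochs=config.optim.num_epochs),
        max_steps=config.optim.max_steps)
    return estimator.evaluate(
        input_fn=lambda: input_fn(config.dataset.valid_path, vocab_table, config,
                                  config.optim.batch_size, num_epochs=1))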
def str_tokens(decoded_ids):
    # Convert decoded ids back to string tokens via the reverse vocabulary lookup.
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]
    return vocab_i2s.lookup(tf.to_int64(decoded_ids))
def input_fn(file_path, vocab_table, config, batch_size, num_epochs=None, num_examples=None,
             seed=0, noiser=None, use_free_set=False, shuffle_input=True):
    gen = read_examples_from_file(
        file_path, config, num_examples, seed, noiser,
        util.get_free_words_set() if use_free_set else None,
        return_gen=True)

    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]
    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    # Each example is a tuple of six variable-length string-token sequences.
    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=(tf.string,) * 6,
        output_shapes=(tf.TensorShape([None]),) * 6)

    dataset = dataset.map(lambda *x: tuple(vocab_table.lookup(i) for i in x))
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]),) * 6,
        padding_values=(pad_id,) * 6)

    if num_epochs and shuffle_input:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    # tf.estimator expects (features, labels); attach a constant dummy label.
    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()
    dataset = tf.data.Dataset.zip((dataset, fake_label))
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset