Example 1
def add_extra_summary_avg_bleu(hypo_tokens,
                               hypo_len,
                               ref_words,
                               collections=None):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]

    ref_tokens = vocab_i2s.lookup(ref_words)
    ref_len = length_pre_embedding(ref_words)

    avg_bleu = get_avg_bleu(ref_tokens, hypo_tokens, ref_len, hypo_len)
    tf.summary.scalar('bleu', avg_bleu, collections)

    return avg_bleu
Example 2
def input_fn(file_path,
             vocab_table,
             batch_size,
             num_epochs=None,
             num_examples=None,
             seed=0,
             noiser=None,
             use_free_set=False,
             shuffle_input=True):
    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]

    pad_token = tf.constant(bytes(PAD_TOKEN, encoding='utf8'), dtype=tf.string)
    pad_id = vocab_table.lookup(pad_token)

    base_dataset = read_examples_from_file(
        file_path, num_examples, seed, noiser,
        util.get_free_words_set() if use_free_set else None)

    dataset_splits = []
    for index in range(len(base_dataset[0])):
        split_dtype = infer_dtype(base_dataset[0][index])

        split = tf.data.Dataset.from_generator(generator=get_generator(
            base_dataset, index),
                                               output_types=(split_dtype),
                                               output_shapes=(None, ))

        if split_dtype == tf.string:
            pad = pad_token
        else:
            pad = pad_id

        split = split.padded_batch(batch_size,
                                   padded_shapes=[None],
                                   padding_values=pad)

        dataset_splits.append(split)

    dataset = tf.data.Dataset.zip(tuple(dataset_splits))
    if num_epochs and shuffle_input:
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = dataset.zip((dataset, fake_label)) \
        .prefetch(1)

    return dataset
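
Note that get_generator is referenced here (and again in Example 8) but is not shown in these snippets. A plausible minimal version, assuming base_dataset is a list of example tuples and index selects one field per example, would look like this sketch:

def get_generator(base_dataset, index):
    # Close over the dataset and the column index so tf.data.Dataset.from_generator
    # can call the resulting zero-argument generator repeatedly.
    def gen():
        for example in base_dataset:
            yield example[index]
    return gen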
Example 3
def words2ids(words, oov=None):
    if oov is None:
        oov = []

    word2id = vocab.get_vocab_lookup_tables()[vocab.RAW_WORD2ID]

    ids = []
    for w in words:
        if w not in word2id:
            if w in oov:
                ids.append(len(word2id) + oov.index(w))
            else:
                ids.append(word2id[vocab.UNKNOWN_TOKEN])
        else:
            ids.append(word2id[w])

    return ids
Example 4
def input_fn(file_path,
             vocab_table,
             config,
             batch_size,
             num_epochs=None,
             num_examples=None,
             seed=0,
             noiser=None,
             use_free_set=False,
             shuffle_input=True):
    gen = read_examples_from_file(
        file_path, config, num_examples, seed,
        noiser, util.get_free_words_set() if use_free_set else None, return_gen=True
    )

    probs = util.load_str_list(str(file_path) + '_probs')
    probs = [float(p) for p in probs]
    dataset_probs = tf.data.Dataset.from_tensor_slices(
        np.array(probs, dtype=np.float32).reshape((-1, 1)))
    dataset_probs = dataset_probs.batch(batch_size)

    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]

    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=(tf.string, tf.string, tf.string, tf.string, tf.string, tf.string),
        output_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]))
    )
    dataset = dataset.map(lambda *x: tuple([vocab_table.lookup(i) for i in x]))
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None])),
        padding_values=tuple([pad_id] * 6)
    )

    dataset = tf.data.Dataset.zip((dataset, dataset_probs))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = dataset.zip((dataset, fake_label))
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
Example 5
def create_oov(words):
    word2id = vocab.get_vocab_lookup_tables()[vocab.RAW_WORD2ID]

    ids = []
    ids_extended = []
    oov = []

    for w in words:
        if w not in word2id:
            if w not in oov:
                oov.append(w)

            ids_extended.append(len(word2id) + oov.index(w))
            ids.append(word2id[vocab.UNKNOWN_TOKEN])
        else:
            ids.append(word2id[w])
            ids_extended.append(word2id[w])

    return ids, ids_extended, oov
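
words2ids (Example 3) and create_oov work as a pair, presumably for a copy/pointer mechanism: create_oov assigns out-of-vocabulary words temporary ids just past the end of the vocabulary, and words2ids reuses those ids when mapping a related sequence against the same oov list. A minimal self-contained sketch of that behaviour, using an invented word2id mapping in place of vocab.get_vocab_lookup_tables():

word2id = {'<unk>': 0, 'the': 1, 'cat': 2}   # toy vocabulary, invented for illustration
UNK = '<unk>'

def create_oov_toy(words):
    ids, ids_extended, oov = [], [], []
    for w in words:
        if w in word2id:
            ids.append(word2id[w])
            ids_extended.append(word2id[w])
        else:
            if w not in oov:
                oov.append(w)
            ids.append(word2id[UNK])                          # plain ids fall back to <unk>
            ids_extended.append(len(word2id) + oov.index(w))  # extended ids keep the word's identity
    return ids, ids_extended, oov

ids, ids_extended, oov = create_oov_toy(['the', 'zorp', 'cat', 'zorp'])
# ids          -> [1, 0, 2, 0]
# ids_extended -> [1, 3, 2, 3]   (3 = len(word2id) + position of 'zorp' in oov)
# oov          -> ['zorp']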
Example 6
def add_extra_summary_trace(pred_tokens,
                            pred_len,
                            base_words,
                            output_words,
                            src_words,
                            tgt_words,
                            inserted_words,
                            deleted_words,
                            collections=None):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]

    tgt_tokens = vocab_i2s.lookup(tgt_words)
    tgt_len = length_pre_embedding(tgt_words)

    trace_summary = get_trace(pred_tokens, tgt_tokens, src_words,
                              inserted_words, deleted_words, pred_len, tgt_len)
    tf.summary.text('trace', trace_summary, collections)

    return trace_summary
Example 7
def get_trace(pred_tokens, tgt_tokens, src_words, inserted_words,
              deleted_words, pred_len, tgt_len):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]

    if pred_tokens.shape.ndims > 2:
        pred_joined = metrics.join_beams(pred_tokens, pred_len)
    else:
        pred_joined = metrics.join_tokens(pred_tokens, pred_len)

    tgt_joined = metrics.join_tokens(tgt_tokens, tgt_len)
    src_joined = metrics.join_tokens(vocab_i2s.lookup(src_words),
                                     length_pre_embedding(src_words))
    iw_joined = metrics.join_tokens(vocab_i2s.lookup(inserted_words),
                                    length_pre_embedding(inserted_words), ', ')
    dw_joined = metrics.join_tokens(vocab_i2s.lookup(deleted_words),
                                    length_pre_embedding(deleted_words), ', ')

    return tf.concat(
        [src_joined, iw_joined, dw_joined, tgt_joined, pred_joined], axis=1)
Example 8
def input_fn_from_gen_multi(gen,
                            vocab_table,
                            batch_size,
                            shuffle_input=False,
                            num_epochs=None,
                            prefetch=False):
    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]
    base_dataset = list(gen())

    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    dataset_splits = []
    for index in range(len(base_dataset[0])):
        split = tf.data.Dataset.from_generator(generator=get_generator(
            base_dataset, index),
                                               output_types=(tf.string),
                                               output_shapes=(None, ))
        split = split.map(lambda x: vocab_table.lookup(x))
        split = split.padded_batch(batch_size,
                                   padded_shapes=[None],
                                   padding_values=(pad_id))

        dataset_splits.append(split)

    dataset = tf.data.Dataset.zip(tuple(dataset_splits))
    if num_epochs and shuffle_input:
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = dataset.zip((dataset, fake_label))
    if prefetch:
        dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset
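
The padded-batch plus constant-label pattern shared by Examples 2, 4, 8, and 11 reduces to a few lines. The toy generator and pad id below are invented for illustration; the point is that tf.estimator input functions must return (features, labels) pairs, so a constant 0 stands in for the unused label:

import tensorflow as tf

PAD_ID = 0  # hypothetical pad id

def toy_column():
    # Variable-length "sentences" of token ids.
    for ids in ([1, 2, 3], [4, 5], [6]):
        yield ids

split = tf.data.Dataset.from_generator(
    toy_column, output_types=tf.int64, output_shapes=(None,))
split = split.padded_batch(2,
                           padded_shapes=[None],
                           padding_values=tf.constant(PAD_ID, dtype=tf.int64))

# Estimator input_fns return (features, labels); the label here is a dummy.
fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()
dataset = tf.data.Dataset.zip((split, fake_label)).prefetch(1)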
Example 9
def get_estimator(config, embed_matrix, my_model_fn=model_fn):
    run_config = tf.estimator.RunConfig(
        model_dir=config.model_dir,
        tf_random_seed=config.seed,
        save_checkpoints_steps=config.eval.save_steps,
        save_summary_steps=config.eval.save_summary_steps,
        keep_checkpoint_max=config.eval.keep_checkpoint_max,
        log_step_count_steps=10
    )

    estimator = tf.estimator.Estimator(
        model_fn=lambda features, labels, mode, params: my_model_fn(
            features,
            mode,
            params,
            embed_matrix,
            vocab.get_vocab_lookup_tables()
        ),
        model_dir=config.model_dir,
        config=run_config,
        params=config
    )

    return estimator
Example 10
def str_tokens(decoded_ids):
    vocab_i2s = vocab.get_vocab_lookup_tables()[vocab.INT_TO_STR]
    return vocab_i2s.lookup(tf.to_int64(decoded_ids))
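
vocab.get_vocab_lookup_tables() itself is not shown in these examples; the INT_TO_STR table it returns is presumably a TensorFlow lookup table mapping ids back to word strings. A minimal stand-in with an invented vocabulary, runnable eagerly in TF 2:

import tensorflow as tf

words = tf.constant(['<pad>', '<unk>', 'hello', 'world'])
ids = tf.constant([0, 1, 2, 3], dtype=tf.int64)

# Hash table from id -> word string, with '<unk>' for ids outside the vocabulary.
int_to_str = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(ids, words),
    default_value='<unk>')

print(int_to_str.lookup(tf.constant([2, 3, 0], dtype=tf.int64)))
# tf.Tensor([b'hello' b'world' b'<pad>'], shape=(3,), dtype=string)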
Example 11
def input_fn(file_path,
             vocab_table,
             config,
             batch_size,
             num_epochs=None,
             num_examples=None,
             seed=0,
             noiser=None,
             use_free_set=False,
             shuffle_input=True):
    gen = read_examples_from_file(
        file_path,
        config,
        num_examples,
        seed,
        noiser,
        util.get_free_words_set() if use_free_set else None,
        return_gen=True)

    # gen = lambda: iter(base_dataset)

    # return input_fn_from_gen_multi(
    #     gen,
    #     vocab_table, batch_size,
    #     shuffle_input=shuffle_input,
    #     num_epochs=num_epochs,
    #     prefetch=True
    # )

    vocab_table = vocab.get_vocab_lookup_tables()[vocab.STR_TO_INT]
    # base_dataset = list(gen())

    pad_id = tf.constant(vocab.SPECIAL_TOKENS.index(PAD_TOKEN), dtype=tf.int64)

    dataset = tf.data.Dataset.from_generator(
        generator=gen,
        output_types=(tf.string, tf.string, tf.string, tf.string, tf.string,
                      tf.string),
        output_shapes=(tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None]),
                       tf.TensorShape([None]), tf.TensorShape([None])))
    dataset = dataset.map(lambda *x: tuple([vocab_table.lookup(i) for i in x]))
    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes=(tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None]),
                                                  tf.TensorShape([None])),
                                   padding_values=tuple([pad_id] * 6))

    if num_epochs and shuffle_input:
        dataset = dataset.apply(
            tf.contrib.data.shuffle_and_repeat(500, num_epochs))
    elif num_epochs:
        dataset = dataset.repeat(num_epochs)

    fake_label = tf.data.Dataset.from_tensor_slices(tf.constant([0])).repeat()

    dataset = dataset.zip((dataset, fake_label))
    dataset = dataset.prefetch(buffer_size=tf.contrib.data.AUTOTUNE)

    return dataset