Example #1
def test_read_word_embeddings_with_special_tokens(embedding_file):
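    """The vocab read with special tokens should equal SPECIAL_TOKENS + VOCAB, and
    the rows after the special-token slots should match the gold embeddings."""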
    file_name, gold_embeddings = embedding_file

    cm_vocab, cm_embeddings = vocab.read_word_embeddings(file_name, EMBED_DIM)
    assert cm_vocab == (vocab.SPECIAL_TOKENS + VOCAB)

    for i, embed in enumerate(cm_embeddings[len(vocab.SPECIAL_TOKENS):, :]):
        assert list(gold_embeddings[i]) == list(embed)
Example #2
def test_read_word_embeddings_without_special_tokens(embedding_file):
    file_name, gold_embeddings = embedding_file

    cm_vocab, cm_embeddings = vocab.read_word_embeddings(
        file_name, EMBED_DIM, include_special_tokens=False)
    assert cm_vocab == (VOCAB)

    for i, embed in enumerate(cm_embeddings):
        assert list(gold_embeddings[i]) == list(embed)
Example #3
def test_read_word_embeddings_with_special_tokens_custom_vocab_size(
        embedding_file):
    file_name, gold_embeddings = embedding_file

    vocab_size = len(VOCAB) // 2
    cm_vocab, cm_embeddings = vocab.read_word_embeddings(
        file_name, EMBED_DIM, vocab_size)
    assert cm_vocab == (vocab.SPECIAL_TOKENS + VOCAB[:vocab_size])

    for i, embed in enumerate(cm_embeddings[len(vocab.SPECIAL_TOKENS):, :]):
        assert list(gold_embeddings[i]) == list(embed)
Example #4
def test_read_word_embeddings_without_special_tokens_custom_vocab_size(
        embedding_file):
    file_name, gold_embeddings = embedding_file

    vocab_size = len(VOCAB) // 2
    cm_vocab, cm_embeddings = vocab.read_word_embeddings(
        file_name, EMBED_DIM, vocab_size, include_special_tokens=False)
    assert cm_vocab == (VOCAB[:vocab_size])

    for i, embed in enumerate(cm_embeddings):
        assert list(gold_embeddings[i]) == list(embed)
Example #5
def test_lookup_ops(embedding_file):
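    """Special tokens and vocab words should map to consecutive ids; out-of-vocabulary words map to OOV_TOKEN_ID."""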
    fn, gold_embeds = embedding_file

    cm_vocab, _ = vocab.read_word_embeddings(fn, EMBED_DIM)

    oov_words = ['dsjii', 'disjfi']
    test_strings = vocab.SPECIAL_TOKENS + VOCAB + oov_words

    tf.enable_eager_execution()
    test_strings = tf.constant(test_strings)
    table = vocab.get_vocab_lookup(cm_vocab)
    ids = table.lookup(test_strings)
    assert list(ids.numpy()) == list(range(
        len(vocab.SPECIAL_TOKENS +
            VOCAB))) + [vocab.OOV_TOKEN_ID] * len(oov_words)
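
The embedding_file fixture that the tests above rely on is not shown on this page. The sketch below is a hypothetical, minimal version of it, assuming a GloVe-style text file (one token followed by EMBED_DIM space-separated floats per line); the EMBED_DIM and VOCAB values are illustrative, and the real fixture must write the vectors exactly as vocab.read_word_embeddings parses them so that the equality assertions above hold.

import numpy as np
import pytest

EMBED_DIM = 4                         # illustrative value
VOCAB = ['the', 'cat', 'sat', 'mat']  # illustrative word list


@pytest.fixture
def embedding_file(tmp_path):
    # Write a small GloVe-style embedding file: "<token> <v1> ... <vEMBED_DIM>" per line.
    gold_embeddings = np.random.rand(len(VOCAB), EMBED_DIM).astype(np.float32)
    file_name = str(tmp_path / 'embeddings.txt')
    with open(file_name, 'w', encoding='utf8') as f:
        for token, vector in zip(VOCAB, gold_embeddings):
            f.write(token + ' ' + ' '.join(repr(float(x)) for x in vector) + '\n')
    # Tests unpack the fixture as (file_name, gold_embeddings).
    return file_name, gold_embeddings
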
Example #6
def eval(config, data_dir, checkpoint_path=None, my_model_fn=model_fn):
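    """Build an estimator from the config and embeddings and run evaluation over
    the eval input function, optionally restoring from checkpoint_path."""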
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )

    estimator = get_estimator(config, embed_matrix, my_model_fn)

    output = estimator.evaluate(
        input_fn=lambda: eval_input_fn(config, data_dir, vocab.create_vocab_lookup_tables(V), num_examples=1e10),
        checkpoint_path=checkpoint_path
    )

    return output
Example #7
def augment_debug(config, debug_dataset, data_dir, checkpoint_path=None, my_model_fn=model_fn):
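    """Load pickled debug examples, run them through augment_debug_dataset with a
    batch-size-1 estimator, and write the result to a *_debugged JSON file."""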
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )

    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix, my_model_fn)

    with open(str(data_dir / debug_dataset), 'rb') as f:
        debug_examples = pickle.load(f)

    debugged = augment_debug_dataset(debug_examples, estimator, checkpoint_path, V)

    with open("%s_debugged" % debug_dataset, 'w', encoding='utf8') as f:
        json.dump(debugged, f)
Example #8
def test_rnn_encoder(dataset_file, embedding_file):
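    """rnn_encoder should produce an output of shape (BATCH_SIZE, EDIT_DIM) for every batch in the dataset."""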
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE,
                                         NUM_EPOCH)

        embedding = tf.get_variable(
            'embeddings',
            shape=embed_matrix.shape,
            initializer=tf.constant_initializer(embed_matrix))

        iter = dataset.make_initializable_iterator()
        (src, tgt, iw, dw), _ = iter.get_next()

        EDIT_DIM = 8
        output = ev.rnn_encoder(tf.nn.embedding_lookup(embedding, src),
                                tf.nn.embedding_lookup(embedding, tgt),
                                tf.nn.embedding_lookup(embedding, iw),
                                tf.nn.embedding_lookup(embedding, dw),
                                sequence.length_pre_embedding(src),
                                sequence.length_pre_embedding(tgt),
                                sequence.length_pre_embedding(iw),
                                sequence.length_pre_embedding(dw), 256, 2, 256,
                                1, EDIT_DIM, 100.0, 0.1, 14.0, 0.8)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(iter.initializer)

            while True:
                try:
                    oeo = sess.run(output)
                    assert oeo.shape == (BATCH_SIZE, EDIT_DIM)
                except tf.errors.OutOfRangeError:
                    # dataset iterator exhausted
                    break
Example #9
def augment_meta_test(config, meta_test_path, data_dir, checkpoint_path=None, my_model_fn=model_fn):
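    """Filter training pairs whose source/target Jaccard similarity lies between 0.6 and 0.8,
    augment every meta-test episode with augment_dataset, and pickle the augmented episodes."""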
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )

    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix, my_model_fn)

    with open(str(data_dir / config.dataset.path / 'train.tsv'), encoding='utf8') as f:
        train_examples = []
        for l in tqdm(f, total=util.get_num_total_lines(data_dir / config.dataset.path / 'train.tsv')):
            l = l[:-1]
            src, tgt = l.split('\t')

            train_examples.append((l, util.jaccard(
                set([w.lower() for w in src.split(' ')]),
                set([w.lower() for w in tgt.split(' ')]),
            )))

        train_examples = list(filter(lambda x: 0.6 < x[1] < 0.8, train_examples))

    with open(meta_test_path, 'rb') as f:
        meta_test = pickle.load(f)

    new_meta_test = []
    all_dtrain = augment_dataset(train_examples, estimator, checkpoint_path, meta_test, V)
    for episode_id in all_dtrain:
        new_meta_test.append((
            all_dtrain[episode_id],
            meta_test[episode_id][1],
            meta_test[episode_id][2],
        ))

    with open(meta_test_path + '_augmented.pkl', 'wb') as f:
        pickle.dump(new_meta_test, f)
Example #10
def test_wtf():
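    """Prepare decoder inputs/outputs for each batch and look the id tensors back up as strings for inspection."""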
    with tf.Graph().as_default():
        V, embed_matrix = vocab.read_word_embeddings(
            Path('../data') / 'word_vectors' / 'glove.6B.300d_yelp.txt',
            300,
            10000
        )

        table = vocab.create_vocab_lookup_tables(V)
        vocab_s2i = table[vocab.STR_TO_INT]
        vocab_i2s = table[vocab.INT_TO_STR]

        dataset = input_fn('../data/yelp_dataset_large_split/train.tsv', table, 64, 1)
        iter = dataset.make_initializable_iterator()

        (src, tgt, iw, dw), _ = iter.get_next()
        src_len = length_pre_embedding(src)
        tgt_len = length_pre_embedding(tgt)
        iw_len = length_pre_embedding(iw)
        dw_len = length_pre_embedding(dw)

        dec_inputs = decoder.prepare_decoder_inputs(tgt, vocab.get_token_id(vocab.START_TOKEN, vocab_s2i))

        dec_output = decoder.prepare_decoder_output(tgt, tgt_len, vocab.get_token_id(vocab.STOP_TOKEN, vocab_s2i),
                                                    vocab.get_token_id(vocab.PAD_TOKEN, vocab_s2i))

        t_src = vocab_i2s.lookup(src)
        t_tgt = vocab_i2s.lookup(tgt)
        t_iw = vocab_i2s.lookup(iw)
        t_dw = vocab_i2s.lookup(dw)

        t_do = vocab_i2s.lookup(dec_output)
        t_di = vocab_i2s.lookup(dec_inputs)

        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(), tf.local_variables_initializer(), tf.tables_initializer()])
            sess.run(iter.initializer)

            while True:
                try:
                    # src, tgt, iw, dw = sess.run([src, tgt, iw, dw])
                    ts, tt, tiw, tdw, tdo, tdi = sess.run([t_src, t_tgt, t_iw, t_dw, t_do, t_di])
                except tf.errors.OutOfRangeError:
                    break
Example #11
def test_num_examples():
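    """Iterate over the input_fn dataset, unpack the feature tuple, and print each batch of features."""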
    V, embed_matrix = vocab.read_word_embeddings(
        'data/word_vectors/glove.6B.300d_dbpedia.txt',
        300,
        10000
    )

    vocab_table = vocab.create_vocab_lookup_tables(V)
    dataset = input_fn('data/quora_naug/train.tsv', vocab_table, 10, 1)
    for (features, _) in dataset:
        base_words, extended_base_words, \
        output_words, extended_output_words, \
        src_words, tgt_words, \
        inserted_words, deleted_words, \
        oov = features

        oov_len = sequence.length_string(oov, vocab.PAD_TOKEN)

        print(features)
Example #12
def predict(config, data_dir, checkpoint_path=None, my_model_fn=model_fn):
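    """Run estimator.predict on a small eval sample (10 examples) and print the joined prediction for each."""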
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )

    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix, my_model_fn)

    output = estimator.predict(
        input_fn=lambda: eval_input_fn(config, data_dir, vocab.create_vocab_lookup_tables(V), num_examples=10),
        checkpoint_path=checkpoint_path
    )

    for p in output:
        print(p['joined'])

    return output
Example #13
def predict_cmd(config, data_dir, checkpoint_path=None):
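    """Run estimator.predict with input_fn_cmd and print the joined prediction for each result."""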
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )

    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix)

    output = estimator.predict(
        input_fn=lambda: input_fn_cmd(vocab.create_vocab_lookup_tables(V)),
        checkpoint_path=checkpoint_path
    )

    for p in output:
        print('\nResult:')
        print(p['joined'])

    return output
Example #14
def get_vocab_embedding_matrix(config, data_dir):
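    """Return (vocab, embedding matrix), using sub-word embeddings when configured,
    otherwise word-level embeddings (optionally randomly initialized, with an optional vocab file)."""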
    if config.editor.get('use_sub_words', False):
        if config.editor.get('use_t2t_sub_words'):
            output = vocab.read_t2t_subword_embeddings(config)
        else:
            output = vocab.read_subword_embeddings(config)
    else:
        if config.editor.get('word_vocab_file_path', None):
            vocab_path = str(config.local_data_dir / config.dataset.path / config.editor.word_vocab_file_path)
        else:
            vocab_path = None

        output = vocab.read_word_embeddings(
            data_dir / 'word_vectors' / config.editor.wvec_path,
            config.editor.word_dim,
            config.editor.vocab_size,
            random_initialization=(not config.editor.get('use_pretrained_embeddings', True)),
            vocab_file=vocab_path
        )

    return output
Example #15
def test_context_encoder(dataset_file, embedding_file):
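    """context_encoder should produce outputs of shape (BATCH_SIZE, max_src_len, HIDDEN_DIM) for every batch."""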
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE,
                                         NUM_EPOCH)

        embedding = tf.get_variable(
            'embeddings',
            shape=embed_matrix.shape,
            initializer=tf.constant_initializer(embed_matrix))

        iter = dataset.make_initializable_iterator()
        (_, _, src, _), _ = iter.get_next()

        src_len = sequence.length_pre_embedding(src)
        src_embd = tf.nn.embedding_lookup(embedding, src)

        output = ev.context_encoder(src_embd, src_len, HIDDEN_DIM, NUM_LAYER)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(iter.initializer)

            while True:
                try:
                    oeo, o_src, o_src_len, o_src_embd = sess.run(
                        [output, src, src_len, src_embd])
                    assert oeo.shape == (BATCH_SIZE, o_src_len.max(),
                                         HIDDEN_DIM)
                except tf.errors.OutOfRangeError:
                    # dataset iterator exhausted
                    break
Example #16
def test_encoder(dataset_file, embedding_file):
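    """bidirectional_encoder should produce outputs of shape (BATCH_SIZE, max_src_len, HIDDEN_DIM)."""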
    d_fn, gold_dataset = dataset_file
    e_fn, gold_embeds = embedding_file

    v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
    vocab_lookup = vocab.get_vocab_lookup(v)

    dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE, NUM_EPOCH)

    embedding = tf.get_variable(
        'embeddings',
        shape=embed_matrix.shape,
        initializer=tf.constant_initializer(embed_matrix))

    iter = dataset.make_initializable_iterator()
    (src, _, _, _), _ = iter.get_next()

    src_len = sequence.length_pre_embedding(src)
    src_embd = tf.nn.embedding_lookup(embedding, src)

    encoder_output, _ = encoder.bidirectional_encoder(src_embd, src_len,
                                                      HIDDEN_DIM, NUM_LAYER,
                                                      0.9)

    with tf.Session() as sess:
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
            tf.tables_initializer()
        ])
        sess.run(iter.initializer)

        oeo, o_src, o_src_len, o_src_embd = sess.run(
            [encoder_output, src, src_len, src_embd])

        for i in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(i)

        assert oeo.shape == (BATCH_SIZE, o_src_len.max(), HIDDEN_DIM)
Example #17
def test_decoder_train(dataset_file, embedding_file):
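    """Build the full decoder graph (source encoder, agenda, random-noise edit vector,
    train and beam-eval decoders) and run the beam eval decode on one batch."""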
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        stop_token = tf.constant(bytes(vocab.STOP_TOKEN, encoding='utf8'),
                                 dtype=tf.string)
        stop_token_id = vocab_lookup.lookup(stop_token)

        start_token = tf.constant(bytes(vocab.START_TOKEN, encoding='utf8'),
                                  dtype=tf.string)
        start_token_id = vocab_lookup.lookup(start_token)

        pad_token = tf.constant(bytes(vocab.PAD_TOKEN, encoding='utf8'),
                                dtype=tf.string)
        pad_token_id = vocab_lookup.lookup(pad_token)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE,
                                         NUM_EPOCH)
        iter = dataset.make_initializable_iterator()
        (src, tgt, inw, dlw), _ = iter.get_next()

        src_len = sequence.length_pre_embedding(src)

        tgt_len = sequence.length_pre_embedding(tgt)

        dec_inputs = decoder.prepare_decoder_inputs(tgt, start_token_id)
        dec_outputs = decoder.prepare_decoder_output(tgt, tgt_len,
                                                     stop_token_id,
                                                     pad_token_id)

        dec_inputs_len = sequence.length_pre_embedding(dec_inputs)
        dec_outputs_len = sequence.length_pre_embedding(dec_outputs)

        batch_size = tf.shape(src)[0]
        edit_vector = edit_encoder.random_noise_encoder(
            batch_size, EDIT_DIM, 14.0)

        embedding = tf.get_variable(
            'embeddings',
            shape=embed_matrix.shape,
            initializer=tf.constant_initializer(embed_matrix))

        src_embd = tf.nn.embedding_lookup(embedding, src)
        src_sent_embeds, final_states = encoder.source_sent_encoder(
            src_embd, src_len, 20, 3, 0.8)

        agn = agenda.linear(final_states, edit_vector, 4)

        dec_out = decoder.train_decoder(agn, embedding, dec_inputs,
                                        src_sent_embeds,
                                        tf.nn.embedding_lookup(embedding, inw),
                                        tf.nn.embedding_lookup(embedding, dlw),
                                        dec_inputs_len, src_len,
                                        sequence.length_pre_embedding(inw),
                                        sequence.length_pre_embedding(dlw), 5,
                                        20, 3, False)

        # eval_dec_out = decoder.greedy_eval_decoder(
        #     agn, embedding,
        #     start_token_id, stop_token_id,
        #     src_sent_embeds,
        #     tf.nn.embedding_lookup(embedding, inw),
        #     tf.nn.embedding_lookup(embedding, dlw),
        #     src_len, sequence.length_pre_embedding(inw), sequence.length_pre_embedding(dlw),
        #     5, 20, 3, 40
        # )

        eval_dec_out = decoder.beam_eval_decoder(
            agn, embedding, start_token_id, stop_token_id, src_sent_embeds,
            tf.nn.embedding_lookup(embedding, inw),
            tf.nn.embedding_lookup(embedding, dlw), src_len,
            sequence.length_pre_embedding(inw),
            sequence.length_pre_embedding(dlw), 5, 20, 3, 40)

        # saver = tf.train.Saver(write_version=tf.train.SaverDef.V1)
        # s = tf.summary.FileWriter('data/an')
        # s.add_graph(g)
        #
        # all_print = tf.get_collection('print')

        an, final_states, dec_out_len = dec_out
        stacked = decoder.attention_score(dec_out)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(iter.initializer)

            print(sess.run([eval_dec_out]))
Example #18
def test_decoder_prepares(dataset_file, embedding_file):
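    """Decoder inputs should be prefixed with the start token, decoder outputs should end
    with the stop token, and both should be one token longer than the target."""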
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        stop_token = tf.constant(bytes(vocab.STOP_TOKEN, encoding='utf8'),
                                 dtype=tf.string)
        stop_token_id = vocab_lookup.lookup(stop_token)

        start_token = tf.constant(bytes(vocab.START_TOKEN, encoding='utf8'),
                                  dtype=tf.string)
        start_token_id = vocab_lookup.lookup(start_token)

        pad_token = tf.constant(bytes(vocab.PAD_TOKEN, encoding='utf8'),
                                dtype=tf.string)
        pad_token_id = vocab_lookup.lookup(pad_token)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE,
                                         NUM_EPOCH)
        iter = dataset.make_initializable_iterator()
        (_, tgt, _, _), _ = iter.get_next()

        tgt_len = sequence.length_pre_embedding(tgt)

        dec_inputs = decoder.prepare_decoder_inputs(tgt, start_token_id)
        dec_outputs = decoder.prepare_decoder_output(tgt, tgt_len,
                                                     stop_token_id,
                                                     pad_token_id)

        dec_inputs_len = sequence.length_pre_embedding(dec_inputs)
        dec_outputs_len = sequence.length_pre_embedding(dec_outputs)

        dec_outputs_last = sequence.last_relevant(
            tf.expand_dims(dec_outputs, 2), dec_outputs_len)
        dec_outputs_last = tf.squeeze(dec_outputs_last)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(iter.initializer)

            while True:
                try:
                    # Fetch into new names so the graph tensors can be
                    # fetched again on the next iteration.
                    (dec_inputs_v, dec_outputs_v, tgt_len_v, dil, dol, start_id,
                     stop_id, dec_outputs_last_v, tgt_v) = sess.run([
                         dec_inputs, dec_outputs, tgt_len, dec_inputs_len,
                         dec_outputs_len, start_token_id, stop_token_id,
                         dec_outputs_last, tgt
                     ])

                    assert list(dil) == list(dol) == list(tgt_len_v + 1)
                    assert list(dec_inputs_v[:, 0]) == list(
                        np.ones_like(dec_inputs_v[:, 0]) * start_id)
                    assert list(dec_outputs_last_v) == list(
                        np.ones_like(dec_outputs_last_v) * stop_id)
                except tf.errors.OutOfRangeError:
                    break