def test_read_word_embeddings_with_special_tokens(embedding_file):
    file_name, gold_embeddings = embedding_file
    cm_vocab, cm_embeddings = vocab.read_word_embeddings(file_name, EMBED_DIM)

    assert cm_vocab == vocab.SPECIAL_TOKENS + VOCAB
    for i, embed in enumerate(cm_embeddings[len(vocab.SPECIAL_TOKENS):, :]):
        assert list(gold_embeddings[i]) == list(embed)

def test_read_word_embeddings_without_special_tokens(embedding_file):
    file_name, gold_embeddings = embedding_file
    cm_vocab, cm_embeddings = vocab.read_word_embeddings(
        file_name, EMBED_DIM, include_special_tokens=False)

    assert cm_vocab == VOCAB
    for i, embed in enumerate(cm_embeddings):
        assert list(gold_embeddings[i]) == list(embed)

def test_read_word_embeddings_with_special_tokens_custom_vocab_size(
        embedding_file):
    file_name, gold_embeddings = embedding_file
    vocab_size = len(VOCAB) // 2
    cm_vocab, cm_embeddings = vocab.read_word_embeddings(
        file_name, EMBED_DIM, vocab_size)

    assert cm_vocab == vocab.SPECIAL_TOKENS + VOCAB[:vocab_size]
    for i, embed in enumerate(cm_embeddings[len(vocab.SPECIAL_TOKENS):, :]):
        assert list(gold_embeddings[i]) == list(embed)

def test_read_word_embeddings_without_special_tokens_custom_vocab_size(
        embedding_file):
    file_name, gold_embeddings = embedding_file
    vocab_size = len(VOCAB) // 2
    cm_vocab, cm_embeddings = vocab.read_word_embeddings(
        file_name, EMBED_DIM, vocab_size, include_special_tokens=False)

    assert cm_vocab == VOCAB[:vocab_size]
    for i, embed in enumerate(cm_embeddings):
        assert list(gold_embeddings[i]) == list(embed)

def test_lookup_ops(embedding_file):
    fn, gold_embeds = embedding_file
    cm_vocab, _ = vocab.read_word_embeddings(fn, EMBED_DIM)

    oov_words = ['dsjii', 'disjfi']
    test_strings = vocab.SPECIAL_TOKENS + VOCAB + oov_words

    tf.enable_eager_execution()

    test_strings = tf.constant(test_strings)
    table = vocab.get_vocab_lookup(cm_vocab)
    ids = table.lookup(test_strings)

    # In-vocabulary strings map to their index; OOV strings map to OOV_TOKEN_ID.
    assert list(ids.numpy()) == list(range(
        len(vocab.SPECIAL_TOKENS + VOCAB))) + [vocab.OOV_TOKEN_ID] * len(oov_words)

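# The tests above rely on an `embedding_file` fixture defined elsewhere in the
# suite, which yields (file_name, gold_embeddings). Below is a minimal sketch of
# a helper that could back such a fixture, assuming `read_word_embeddings`
# consumes a GloVe-style text file with one "<token> <dim_1> ... <dim_N>" line
# per word (suggested by file names like 'glove.6B.300d_yelp.txt'). The helper
# name and the random gold matrix are illustrative assumptions, not part of the
# project's actual fixtures.
import numpy as np


def _write_example_embedding_file(path, words=VOCAB, dim=EMBED_DIM):
    """Write a GloVe-style embedding file and return the gold matrix (sketch)."""
    gold_embeddings = np.random.rand(len(words), dim)
    with open(path, 'w', encoding='utf8') as f:
        for word, vec in zip(words, gold_embeddings):
            f.write('%s %s\n' % (word, ' '.join('%f' % x for x in vec)))
    return gold_embeddings
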
def eval(config, data_dir, checkpoint_path=None, my_model_fn=model_fn):
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )

    estimator = get_estimator(config, embed_matrix, my_model_fn)
    output = estimator.evaluate(
        input_fn=lambda: eval_input_fn(config, data_dir,
                                       vocab.create_vocab_lookup_tables(V),
                                       num_examples=1e10),
        checkpoint_path=checkpoint_path
    )

    return output

def augment_debug(config, debug_dataset, data_dir, checkpoint_path=None, my_model_fn=model_fn):
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )
    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix, my_model_fn)

    with open(str(data_dir / debug_dataset), 'rb') as f:
        debug_examples = pickle.load(f)

    debugged = augment_debug_dataset(debug_examples, estimator, checkpoint_path, V)

    with open("%s_debugged" % debug_dataset, 'w', encoding='utf8') as f:
        json.dump(debugged, f)

def test_rnn_encoder(dataset_file, embedding_file):
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE, NUM_EPOCH)

        embedding = tf.get_variable(
            'embeddings',
            shape=embed_matrix.shape,
            initializer=tf.constant_initializer(embed_matrix))

        it = dataset.make_initializable_iterator()
        (src, tgt, iw, dw), _ = it.get_next()

        EDIT_DIM = 8
        output = ev.rnn_encoder(
            tf.nn.embedding_lookup(embedding, src),
            tf.nn.embedding_lookup(embedding, tgt),
            tf.nn.embedding_lookup(embedding, iw),
            tf.nn.embedding_lookup(embedding, dw),
            sequence.length_pre_embedding(src),
            sequence.length_pre_embedding(tgt),
            sequence.length_pre_embedding(iw),
            sequence.length_pre_embedding(dw),
            256, 2, 256, 1,
            EDIT_DIM,
            100.0, 0.1, 14.0, 0.8)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(it.initializer)

            while True:
                try:
                    oeo = sess.run(output)
                    assert oeo.shape == (BATCH_SIZE, EDIT_DIM)
                except tf.errors.OutOfRangeError:
                    # End of dataset. A bare `except` here would also swallow
                    # assertion failures and silently pass the test.
                    break

def augment_meta_test(config, meta_test_path, data_dir, checkpoint_path=None, my_model_fn=model_fn):
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )
    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix, my_model_fn)

    # Collect training pairs whose lowercased-token Jaccard similarity
    # lies strictly between 0.6 and 0.8.
    train_path = data_dir / config.dataset.path / 'train.tsv'
    with open(str(train_path), encoding='utf8') as f:
        train_examples = []
        for l in tqdm(f, total=util.get_num_total_lines(train_path)):
            l = l[:-1]
            src, tgt = l.split('\t')
            train_examples.append((l, util.jaccard(
                set([w.lower() for w in src.split(' ')]),
                set([w.lower() for w in tgt.split(' ')]),
            )))

    train_examples = list(filter(lambda x: 0.6 < x[1] < 0.8, train_examples))

    with open(meta_test_path, 'rb') as f:
        meta_test = pickle.load(f)

    # Previous per-episode version, kept for reference:
    # for i, m in enumerate(tqdm(meta_test)):
    #     dtrain = augment_dataset(train_examples, estimator, checkpoint_path, meta_test[0], V)
    #     assert len(dtrain) == len(meta_test[0]) * (1 + NUM_CANDIDATES)
    #     print("dtrain size:", len(dtrain))
    #     meta_test[i] = (dtrain, meta_test[i][1], meta_test[i][2])

    new_meta_test = []
    all_dtrain = augment_dataset(train_examples, estimator, checkpoint_path, meta_test, V)
    for episode_id in all_dtrain:
        new_meta_test.append((
            all_dtrain[episode_id],
            meta_test[episode_id][1],
            meta_test[episode_id][2],
        ))

    with open(meta_test_path + '_augmented.pkl', 'wb') as f:
        pickle.dump(new_meta_test, f)

def test_wtf():
    with tf.Graph().as_default():
        V, embed_matrix = vocab.read_word_embeddings(
            Path('../data') / 'word_vectors' / 'glove.6B.300d_yelp.txt',
            300,
            10000
        )

        table = vocab.create_vocab_lookup_tables(V)
        vocab_s2i = table[vocab.STR_TO_INT]
        vocab_i2s = table[vocab.INT_TO_STR]

        dataset = input_fn('../data/yelp_dataset_large_split/train.tsv', table, 64, 1)

        it = dataset.make_initializable_iterator()
        (src, tgt, iw, dw), _ = it.get_next()

        src_len = length_pre_embedding(src)
        tgt_len = length_pre_embedding(tgt)
        iw_len = length_pre_embedding(iw)
        dw_len = length_pre_embedding(dw)

        dec_inputs = decoder.prepare_decoder_inputs(
            tgt, vocab.get_token_id(vocab.START_TOKEN, vocab_s2i))
        dec_output = decoder.prepare_decoder_output(
            tgt, tgt_len,
            vocab.get_token_id(vocab.STOP_TOKEN, vocab_s2i),
            vocab.get_token_id(vocab.PAD_TOKEN, vocab_s2i))

        # Map ids back to strings for manual inspection.
        t_src = vocab_i2s.lookup(src)
        t_tgt = vocab_i2s.lookup(tgt)
        t_iw = vocab_i2s.lookup(iw)
        t_dw = vocab_i2s.lookup(dw)
        t_do = vocab_i2s.lookup(dec_output)
        t_di = vocab_i2s.lookup(dec_inputs)

        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(),
                      tf.local_variables_initializer(),
                      tf.tables_initializer()])
            sess.run(it.initializer)

            while True:
                try:
                    # src, tgt, iw, dw = sess.run([src, tgt, iw, dw])
                    ts, tt, tiw, tdw, tdo, tdi = sess.run(
                        [t_src, t_tgt, t_iw, t_dw, t_do, t_di])
                except tf.errors.OutOfRangeError:
                    break

def test_num_examples():
    V, embed_matrix = vocab.read_word_embeddings(
        'data/word_vectors/glove.6B.300d_dbpedia.txt',
        300,
        10000
    )
    vocab_table = vocab.create_vocab_lookup_tables(V)

    dataset = input_fn('data/quora_naug/train.tsv', vocab_table, 10, 1)
    # Iterating the dataset directly assumes eager execution is enabled.
    for (features, _) in dataset:
        (base_words, extended_base_words,
         output_words, extended_output_words,
         src_words, tgt_words,
         inserted_words, deleted_words,
         oov) = features

        oov_len = sequence.length_string(oov, vocab.PAD_TOKEN)
        print(features)

def predict(config, data_dir, checkpoint_path=None, my_model_fn=model_fn):
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )
    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix, my_model_fn)
    output = estimator.predict(
        input_fn=lambda: eval_input_fn(config, data_dir,
                                       vocab.create_vocab_lookup_tables(V),
                                       num_examples=10),
        checkpoint_path=checkpoint_path
    )

    # `Estimator.predict` yields prediction dicts lazily; this loop consumes
    # the generator before it is returned.
    for p in output:
        print(p['joined'])

    return output

def predict_cmd(config, data_dir, checkpoint_path=None):
    V, embed_matrix = vocab.read_word_embeddings(
        data_dir / 'word_vectors' / config.editor.wvec_path,
        config.editor.word_dim,
        config.editor.vocab_size
    )
    config.put('optim.batch_size', 1)

    estimator = get_estimator(config, embed_matrix)
    output = estimator.predict(
        input_fn=lambda: input_fn_cmd(vocab.create_vocab_lookup_tables(V)),
        checkpoint_path=checkpoint_path
    )

    for p in output:
        print('\nResult:')
        print(p['joined'])

    return output

def get_vocab_embedding_matrix(config, data_dir):
    if config.editor.get('use_sub_words', False):
        if config.editor.get('use_t2t_sub_words'):
            output = vocab.read_t2t_subword_embeddings(config)
        else:
            output = vocab.read_subword_embeddings(config)
    else:
        if config.editor.get('word_vocab_file_path', None):
            vocab_path = str(config.local_data_dir / config.dataset.path
                             / config.editor.word_vocab_file_path)
        else:
            vocab_path = None

        output = vocab.read_word_embeddings(
            data_dir / 'word_vectors' / config.editor.wvec_path,
            config.editor.word_dim,
            config.editor.vocab_size,
            random_initialization=(not config.editor.get('use_pretrained_embeddings', True)),
            vocab_file=vocab_path
        )

    return output

def test_context_encoder(dataset_file, embedding_file):
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE, NUM_EPOCH)

        embedding = tf.get_variable(
            'embeddings',
            shape=embed_matrix.shape,
            initializer=tf.constant_initializer(embed_matrix))

        it = dataset.make_initializable_iterator()
        (_, _, src, _), _ = it.get_next()
        src_len = sequence.length_pre_embedding(src)
        src_embd = tf.nn.embedding_lookup(embedding, src)

        output = ev.context_encoder(src_embd, src_len, HIDDEN_DIM, NUM_LAYER)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(it.initializer)

            while True:
                try:
                    oeo, o_src, o_src_len, o_src_embd = sess.run(
                        [output, src, src_len, src_embd])
                    assert oeo.shape == (BATCH_SIZE, o_src_len.max(), HIDDEN_DIM)
                except tf.errors.OutOfRangeError:
                    # End of dataset; a bare `except` would also hide assertion
                    # failures.
                    break

def test_encoder(dataset_file, embedding_file):
    d_fn, gold_dataset = dataset_file
    e_fn, gold_embeds = embedding_file

    v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
    vocab_lookup = vocab.get_vocab_lookup(v)

    dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE, NUM_EPOCH)

    embedding = tf.get_variable(
        'embeddings',
        shape=embed_matrix.shape,
        initializer=tf.constant_initializer(embed_matrix))

    it = dataset.make_initializable_iterator()
    (src, _, _, _), _ = it.get_next()
    src_len = sequence.length_pre_embedding(src)
    src_embd = tf.nn.embedding_lookup(embedding, src)

    encoder_output, _ = encoder.bidirectional_encoder(src_embd, src_len,
                                                      HIDDEN_DIM, NUM_LAYER, 0.9)

    with tf.Session() as sess:
        sess.run([
            tf.global_variables_initializer(),
            tf.local_variables_initializer(),
            tf.tables_initializer()
        ])
        sess.run(it.initializer)

        oeo, o_src, o_src_len, o_src_embd = sess.run(
            [encoder_output, src, src_len, src_embd])

        for i in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(i)

        assert oeo.shape == (BATCH_SIZE, o_src_len.max(), HIDDEN_DIM)

def test_decoder_train(dataset_file, embedding_file):
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        stop_token = tf.constant(bytes(vocab.STOP_TOKEN, encoding='utf8'), dtype=tf.string)
        stop_token_id = vocab_lookup.lookup(stop_token)

        start_token = tf.constant(bytes(vocab.START_TOKEN, encoding='utf8'), dtype=tf.string)
        start_token_id = vocab_lookup.lookup(start_token)

        pad_token = tf.constant(bytes(vocab.PAD_TOKEN, encoding='utf8'), dtype=tf.string)
        pad_token_id = vocab_lookup.lookup(pad_token)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE, NUM_EPOCH)
        it = dataset.make_initializable_iterator()
        (src, tgt, inw, dlw), _ = it.get_next()

        src_len = sequence.length_pre_embedding(src)
        tgt_len = sequence.length_pre_embedding(tgt)

        dec_inputs = decoder.prepare_decoder_inputs(tgt, start_token_id)
        dec_outputs = decoder.prepare_decoder_output(tgt, tgt_len, stop_token_id, pad_token_id)

        dec_inputs_len = sequence.length_pre_embedding(dec_inputs)
        dec_outputs_len = sequence.length_pre_embedding(dec_outputs)

        batch_size = tf.shape(src)[0]
        edit_vector = edit_encoder.random_noise_encoder(batch_size, EDIT_DIM, 14.0)

        embedding = tf.get_variable(
            'embeddings',
            shape=embed_matrix.shape,
            initializer=tf.constant_initializer(embed_matrix))

        src_embd = tf.nn.embedding_lookup(embedding, src)
        src_sent_embeds, final_states = encoder.source_sent_encoder(src_embd, src_len, 20, 3, 0.8)

        agn = agenda.linear(final_states, edit_vector, 4)

        dec_out = decoder.train_decoder(
            agn, embedding, dec_inputs, src_sent_embeds,
            tf.nn.embedding_lookup(embedding, inw),
            tf.nn.embedding_lookup(embedding, dlw),
            dec_inputs_len, src_len,
            sequence.length_pre_embedding(inw),
            sequence.length_pre_embedding(dlw),
            5, 20, 3, False)

        # eval_dec_out = decoder.greedy_eval_decoder(
        #     agn, embedding,
        #     start_token_id, stop_token_id,
        #     src_sent_embeds,
        #     tf.nn.embedding_lookup(embedding, inw),
        #     tf.nn.embedding_lookup(embedding, dlw),
        #     src_len, sequence.length_pre_embedding(inw), sequence.length_pre_embedding(dlw),
        #     5, 20, 3, 40
        # )

        eval_dec_out = decoder.beam_eval_decoder(
            agn, embedding,
            start_token_id, stop_token_id,
            src_sent_embeds,
            tf.nn.embedding_lookup(embedding, inw),
            tf.nn.embedding_lookup(embedding, dlw),
            src_len, sequence.length_pre_embedding(inw), sequence.length_pre_embedding(dlw),
            5, 20, 3, 40)

        # saver = tf.train.Saver(write_version=tf.train.SaverDef.V1)
        # s = tf.summary.FileWriter('data/an')
        # s.add_graph(g)
        #
        # all_print = tf.get_collection('print')

        # Unpack the train-decoder output; renamed to avoid shadowing the
        # builtin `len` and the encoder's `final_states`.
        an, dec_final_states, dec_len = dec_out
        stacked = decoder.attention_score(dec_out)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(it.initializer)

            print(sess.run([eval_dec_out]))

def test_decoder_prepares(dataset_file, embedding_file):
    with tf.Graph().as_default():
        d_fn, gold_dataset = dataset_file
        e_fn, gold_embeds = embedding_file

        v, embed_matrix = vocab.read_word_embeddings(e_fn, EMBED_DIM)
        vocab_lookup = vocab.get_vocab_lookup(v)

        stop_token = tf.constant(bytes(vocab.STOP_TOKEN, encoding='utf8'), dtype=tf.string)
        stop_token_id = vocab_lookup.lookup(stop_token)

        start_token = tf.constant(bytes(vocab.START_TOKEN, encoding='utf8'), dtype=tf.string)
        start_token_id = vocab_lookup.lookup(start_token)

        pad_token = tf.constant(bytes(vocab.PAD_TOKEN, encoding='utf8'), dtype=tf.string)
        pad_token_id = vocab_lookup.lookup(pad_token)

        dataset = neural_editor.input_fn(d_fn, vocab_lookup, BATCH_SIZE, NUM_EPOCH)
        it = dataset.make_initializable_iterator()
        (_, tgt, _, _), _ = it.get_next()
        tgt_len = sequence.length_pre_embedding(tgt)

        dec_inputs = decoder.prepare_decoder_inputs(tgt, start_token_id)
        dec_outputs = decoder.prepare_decoder_output(tgt, tgt_len, stop_token_id, pad_token_id)

        dec_inputs_len = sequence.length_pre_embedding(dec_inputs)
        dec_outputs_len = sequence.length_pre_embedding(dec_outputs)

        dec_outputs_last = sequence.last_relevant(
            tf.expand_dims(dec_outputs, 2), dec_outputs_len)
        dec_outputs_last = tf.squeeze(dec_outputs_last)

        with tf.Session() as sess:
            sess.run([
                tf.global_variables_initializer(),
                tf.local_variables_initializer(),
                tf.tables_initializer()
            ])
            sess.run(it.initializer)

            while True:
                try:
                    # Fetch into new names: rebinding the tensor variables to
                    # numpy arrays here would break `sess.run` on the next
                    # iteration.
                    (o_dec_inputs, o_dec_outputs, o_tgt_len, dil, dol,
                     o_start_id, o_stop_id, o_dec_outputs_last, o_tgt) = sess.run([
                        dec_inputs, dec_outputs, tgt_len,
                        dec_inputs_len, dec_outputs_len,
                        start_token_id, stop_token_id,
                        dec_outputs_last, tgt
                    ])

                    # Inputs and outputs are the target shifted by one token,
                    # so both are one longer than the raw target.
                    assert list(dil) == list(dol) == list(o_tgt_len + 1)
                    # Decoder inputs start with the START token.
                    assert list(o_dec_inputs[:, 0]) == list(
                        np.ones_like(o_dec_inputs[:, 0]) * o_start_id)
                    # Decoder outputs end with the STOP token.
                    assert list(o_dec_outputs_last) == list(
                        np.ones_like(o_dec_outputs_last) * o_stop_id)
                except tf.errors.OutOfRangeError:
                    break
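

# A small NumPy illustration of the convention the assertions above check:
# decoder inputs are the target prefixed with the start token, decoder outputs
# are the target followed by the stop token, so both are tgt_len + 1 long.
# The token ids and the helper name below are made up for the example and are
# not the project's real special-token ids.
def _example_decoder_io_convention():
    START, STOP = 1, 2                                       # made-up ids
    tgt = np.array([[5, 6, 7]])                              # one target, length 3
    dec_inputs = np.concatenate([[[START]], tgt], axis=1)    # [[1, 5, 6, 7]]
    dec_outputs = np.concatenate([tgt, [[STOP]]], axis=1)    # [[5, 6, 7, 2]]
    assert dec_inputs.shape[1] == dec_outputs.shape[1] == tgt.shape[1] + 1
    assert dec_inputs[0, 0] == START
    assert dec_outputs[0, -1] == STOP
    return dec_inputs, dec_outputs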