def delete(neg_filepath, pos_filepath, mode='train'):
    '''Divide a sentence into attribute markers and contents.'''
    neg_sentences = load_sentences(neg_filepath)
    pos_sentences = load_sentences(pos_filepath)

    if not (os.path.exists(hp.data_path + '/tf_idf.0')
            and os.path.exists(hp.data_path + '/tf_idf.1')):
        print('making tf_idf dictionary...')
        # n-gram counts per corpus
        neg_ngram_dict = get_ngram(neg_sentences)
        pos_ngram_dict = get_ngram(pos_sentences)
        # relative frequency
        get_tf_idf(neg_ngram_dict, pos_ngram_dict)

    neg_ngrams = dict(
        x.split('\t') for x in codecs.open(hp.data_path + '/tf_idf.0', 'r',
                                           'utf-8').read().splitlines())
    pos_ngrams = dict(
        x.split('\t') for x in codecs.open(hp.data_path + '/tf_idf.1', 'r',
                                           'utf-8').read().splitlines())

    # divide each sentence into an attribute marker and its content
    # neg: 0, pos: 1
    print('dividing sentences...')
    for num in ['0', '1']:
        if num == '0':
            sentences = neg_sentences
            attribute_markers = neg_ngrams
        else:
            sentences = pos_sentences
            attribute_markers = pos_ngrams
        with codecs.open(hp.data_path + '/delete/delete.' + mode + '.' + num,
                         'w', 'utf-8') as fout:
            for sentence in sentences:
                words = sentence.split(' ')
                value_dict = {}
                for i in range(len(words)):
                    for n in range(hp.ngram - 1, 0, -1):
                        if i + n > len(words):
                            continue
                        tmp_attribute_marker = ' '.join(words[i:i + n])
                        if tmp_attribute_marker in attribute_markers:
                            # scores come out of the tf_idf files as strings,
                            # so cast to float before taking the max below
                            value_dict[tmp_attribute_marker] = float(
                                attribute_markers[tmp_attribute_marker])
                # if the sentence contains an attribute marker, keep it
                if len(value_dict) > 0:
                    attribute_marker = max(value_dict, key=value_dict.get)
                    content = sentence.replace(attribute_marker, '')
                    if len(content) > 2:
                        fout.write(sentence + '\t' + content + '\t' +
                                   attribute_marker + '\n')
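# get_ngram and get_tf_idf are defined elsewhere in the repo. A minimal
# sketch of the salience score get_tf_idf is assumed to write out
# (relative frequency with add-lambda smoothing, in the spirit of Li et
# al.'s "Delete, Retrieve, Generate"; the smoothing constant and exact
# file layout are assumptions, not taken from this code):
def get_tf_idf_sketch(neg_ngram_dict, pos_ngram_dict, lam=1.0):
    pairs = [('/tf_idf.0', neg_ngram_dict, pos_ngram_dict),
             ('/tf_idf.1', pos_ngram_dict, neg_ngram_dict)]
    for path, own, other in pairs:
        with codecs.open(hp.data_path + path, 'w', 'utf-8') as fout:
            for ngram, count in own.items():
                # how much more frequent the n-gram is in its own corpus
                salience = (count + lam) / (other.get(ngram, 0) + lam)
                fout.write(ngram + '\t' + str(salience) + '\n')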
def main():
    # load data
    # x_train, y_train, x_test, y_test = data.load_data('treebank')
    sentences, labels, n_classes = data.load_sentences()
    n_input = sentences.shape[0]
    time_steps = sentences.shape[1]

    # replace word ids with trained 100-dim embeddings
    embeddings = embed.embeddings()
    x_train = embeddings.train_embedding(sentences.reshape(n_input * time_steps))
    x_train = x_train.reshape(n_input, time_steps, 100)
    print(x_train.shape)

    # alternative: replace words with pre-trained GloVe embeddings
    # path = 'glove.840B.300d.txt'
    # x_train, y_train = embeddings.embed(path, x_train, y_train)
    # x_train_id, y_train_id, x_test_id, y_test_id = embeddings.make_dictionary(x_train, y_train, x_test, y_test)

    train_using_lstm(x_train, labels, 400, n_classes, time_steps)
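# train_using_lstm is defined elsewhere. A minimal sketch of what a
# trainer with this signature could look like (TF1-style; the framework,
# optimizer, and epoch count are all assumptions):
def train_using_lstm_sketch(x_train, labels, n_hidden, n_classes, time_steps):
    x = tf.placeholder(tf.float32, [None, time_steps, 100])
    y = tf.placeholder(tf.int64, [None])
    cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
    outputs, _ = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
    # classify from the last time step
    logits = tf.layers.dense(outputs[:, -1, :], n_classes)
    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
    train_op = tf.train.AdamOptimizer().minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(10):
            _, cur_loss = sess.run([train_op, loss], {x: x_train, y: labels})
            print('epoch %d loss %f' % (epoch, cur_loss))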
vocab1 = build_vocab(v1, [FLAGS.train, FLAGS.test])
vocab2 = build_vocab(v2, [FLAGS.train, FLAGS.test])

embed1 = Word2VecModel(FLAGS.embed1, vocab1, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed1)

if FLAGS.embed2 is None:
    print('No embed2 found, using embed1 for both')
    FLAGS.embed2 = FLAGS.embed1

embed2 = Word2VecModel(FLAGS.embed2, vocab2, FLAGS.unif)
print('Loaded word embeddings: ' + FLAGS.embed2)

ts = load_sentences(FLAGS.train, embed1.vocab, embed2.vocab, FLAGS.mxlen)
es = load_sentences(FLAGS.test, embed1.vocab, embed2.vocab, FLAGS.mxlen)
rlut1 = revlut(embed1.vocab)
rlut2 = revlut(embed2.vocab)

# with tf.device('/cpu:0'):
with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        if FLAGS.attn:
            seq2seq_creator_fn = (Seq2Seq.create_lstm_attn
                                  if FLAGS.rnntype.lower() == 'lstm'
                                  else Seq2Seq.create_gru_attn)
        else:
            seq2seq_creator_fn = (Seq2Seq.create_lstm
                                  if FLAGS.rnntype.lower() == 'lstm'
                                  else Seq2Seq.create_gru)
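# revlut is defined elsewhere; it is assumed to invert a word -> id
# vocabulary into an id -> word lookup table, roughly:
def revlut_sketch(vocab):
    return {idx: word for word, idx in vocab.items()}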
            else:
                next_value = beam_multinomial(SAMPLE_PRUNE_INIT, output)
            if next_value == EOS:
                break

        sent = lookup_sentence(rlut2, dst_i.squeeze())
        print('Guess: %s' % sent)
        print('------------------------------------------------------------------------')


f2i = {}
seq2seq = Seq2SeqModel()
BASE = 'seq2seq'

with tf.Graph().as_default():
    sess = tf.Session()
    with sess.as_default():
        seq2seq.restore(sess, FLAGS.indir, BASE, FLAGS.mxlen)
        rlut1 = revlut(seq2seq.vocab1)
        rlut2 = revlut(seq2seq.vocab2)
        es = load_sentences(FLAGS.test, seq2seq.vocab1, seq2seq.vocab2,
                            FLAGS.mxlen, FLAGS.batchsz)
        init = tf.global_variables_initializer()
        sess.run(init)
        show_batch(seq2seq, es, sess, rlut1, rlut2, seq2seq.vocab2,
                   FLAGS.sample)
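# beam_multinomial is defined elsewhere; a minimal sketch of pruned
# multinomial sampling consistent with how it is called above (the exact
# pruning rule is an assumption):
def beam_multinomial_sketch(k, probs):
    idx = np.argsort(probs)[-k:]        # keep the k most probable tokens
    p = probs[idx] / probs[idx].sum()   # renormalize over the survivors
    return int(np.random.choice(idx, p=p))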
import tensorflow as tf
import os
from tqdm import tqdm
import random
from data import load_sentences, create_data, load_vocab
from hyperparams import Hyperparams as hp
from graph import Graph
import argparse
import numpy as np

if __name__ == '__main__':
    if not os.path.exists(hp.logdir):
        os.makedirs(hp.logdir)

    # data load
    sents = load_sentences(hp.data_path + '/1-billion-word.train', False)
    valid_sents = load_sentences(hp.data_path + '/1-billion-word.dev', False)
    _, i2w = load_vocab()

    print('Creating data...')
    X, Y, _, _ = create_data(sents)
    valid_X, valid_Y, _, _ = create_data(valid_sents)

    # model
    g = Graph()
    data_size = X.shape[0]
    data_list = list(range(data_size))

    with g.graph.as_default():
        saver = tf.train.Saver()
        with tf.Session() as sess:
            # Initialize
            sess.run(tf.global_variables_initializer())
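            # A minimal sketch of the epoch loop that would follow the
            # initializer (batch size, epoch count, and the train_op /
            # placeholder names on Graph are assumptions):
            for epoch in range(10):
                random.shuffle(data_list)
                for i in tqdm(range(0, data_size, 128)):
                    idx = data_list[i:i + 128]
                    sess.run(g.train_op, {g.x: X[idx], g.y: Y[idx]})
                saver.save(sess, hp.logdir + '/model.ckpt')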
def inference(mode):
    # data load
    neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0', False)
    pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1', False)
    word_w2i, word_i2w = load_vocab()

    # model
    if mode == 'delete_only':
        g = DeleteOnlyGraph()
    else:
        g = DeleteAndRetrieveGraph()

    with g.graph.as_default(), tf.Session() as sess:
        sv = tf.train.Saver()
        # Restore parameters
        print("Restoring parameters...")
        sv.restore(sess, './logdir/' + mode + '/model.ckpt')

        # Inference
        if not os.path.exists(hp.data_path + '/generate'):
            os.mkdir(hp.data_path + '/generate')
        for num in ['0', '1']:
            # feed the opposite attribute so the sentiment is flipped
            if num == '0':
                X, Y, A, Attribute_labels, Sources, Targets, Attributes = create_data(
                    neg_lines, 1, mode='inference')
            else:
                X, Y, A, Attribute_labels, Sources, Targets, Attributes = create_data(
                    pos_lines, 0, mode='inference')
            with codecs.open(hp.data_path + '/generate/' + mode + '.test.' + num,
                             "w", "utf-8") as fout:
                for i in range(len(X)):
                    x = X[i:i + 1]
                    attri_label = np.array(Attribute_labels[i:i + 1]).reshape((-1, 1))
                    a = A[i:i + 1]
                    sources = Sources[i:i + 1]
                    targets = Targets[i:i + 1]
                    # greedy left-to-right decoding
                    preds = np.zeros((1, hp.max_len), np.int32)
                    for j in range(hp.max_len):
                        if mode == 'delete_only':
                            _preds = sess.run(g.pred, {
                                g.x: x,
                                g.y: preds,
                                g.attributes: attri_label
                            })
                        else:
                            _preds = sess.run(g.pred, {g.x: x, g.y: preds, g.a: a})
                        preds[:, j] = _preds[:, j]
                    for source, target, pred in zip(sources, targets, preds):
                        # sentence-wise
                        got = " ".join(word_i2w[idx]
                                       for idx in pred).split("<END>")[0].strip()
                        fout.write("- expected: " + target + "\n")
                        fout.write("- got: " + got + "\n\n")
                        fout.flush()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-mode',
                        action='store',
                        dest='mode',
                        type=str,
                        default='delete_only',
                        help='Enter train mode')
    par_args = parser.parse_args()
    hp.neural_mode = par_args.mode
    if not os.path.exists('logdir/' + hp.neural_mode):
        os.makedirs('logdir/' + hp.neural_mode)

    # data load
    neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0', False)
    pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1', False)
    valid_neg_lines = load_sentences(hp.data_path + '/delete/delete.dev.0', False)
    valid_pos_lines = load_sentences(hp.data_path + '/delete/delete.dev.1', False)

    print('Creating data...')
    neg_X, neg_Y, neg_A, neg_attribute_labels, _, _, _ = create_data(neg_lines, 0)
    pos_X, pos_Y, pos_A, pos_attribute_labels, _, _, _ = create_data(pos_lines, 1)
    X = np.concatenate([neg_X, pos_X], axis=0)
    Y = np.concatenate([neg_Y, pos_Y], axis=0)
    A = np.concatenate([neg_A, pos_A], axis=0)
    a_labels = np.array(neg_attribute_labels + pos_attribute_labels).reshape(
vocab1 = build_vocab(v1, {args.train, args.test})
vocab2 = build_vocab(v2, {args.train, args.test})

embed1 = Word2VecModel(args.embed1, vocab1, args.unif)
print('Loaded word embeddings: ' + args.embed1)

if args.embed2 is None:
    print('No embed2 found, using embed1 for both')
    args.embed2 = args.embed1

embed2 = Word2VecModel(args.embed2, vocab2, args.unif)
print('Loaded word embeddings: ' + args.embed2)

ts = load_sentences(args.train, embed1.vocab, embed2.vocab, args.mxlen,
                    args.batchsz, long_0_tensor_alloc)
es = load_sentences(args.test, embed1.vocab, embed2.vocab, args.mxlen,
                    args.batchsz, long_0_tensor_alloc)
rlut1 = revlut(embed1.vocab)
rlut2 = revlut(embed2.vocab)

Seq2SeqModelType = Seq2SeqAttnModel if args.attn else Seq2SeqModel
print(Seq2SeqModelType)
seq2seq = Seq2SeqModelType(embed1, embed2, args.mxlen, args.hsz, args.layers,
                           args.rnntype)
trainer = Trainer(gpu, seq2seq, args.optim, args.eta, args.mom)

err_min = 1
last_improved = 0
reset = 0
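# err_min / last_improved / reset suggest an early-stopping loop like the
# sketch below (Trainer's method names and the epochs/patience flags are
# assumptions, not this repo's API):
for epoch in range(args.epochs):
    trainer.train(ts)
    err = trainer.test(es)
    if err < err_min:
        err_min = err
        last_improved = epoch
    if epoch - last_improved > args.patience:
        print('No improvement for %d epochs, stopping' % args.patience)
        break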
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_arguments()
    data = load_data()
    # sort strokes by length so sentences can stay aligned with them
    sorted_indices = np.argsort([len(d) for d in data])
    data = [data[i] for i in sorted_indices]
    normalized_data, m, s = normalize_strokes(data)
    m = m.tolist()
    s = s.tolist()

    sentences = load_sentences()
    sentences = [sentences[i] for i in sorted_indices]
    alphabet, alphabet_dict = compute_alphabet(sentences)
    sentence_vars = [
        Variable(torch.from_numpy(
            sentence_to_vectors(sent, alphabet_dict)).float().cuda(),
                 requires_grad=False)
        for sent in tqdm(sentences, desc="Converting Sentences")
    ]

    # Create RNN
    # NOTE: these hardcoded constants overwrite the mean/std computed above
    m = [0.41261038184165955, -0.006002499256283045]
    s = [2.0667049884796143, 1.8475052118301392]
    if args.unconditioned:
        rnn = GeneratorRNN(num_components=20, mean=m, std=s).cuda()
    elif args.conditioned:
from hyperparams import Hyperparams as hp
from data import load_sentences, make_dict
import os
from delete import delete

print("Make word dictionary...")
neg_lines = load_sentences(hp.data_path + '/sentiment.train.0', True)
pos_lines = load_sentences(hp.data_path + '/sentiment.train.1', True)
make_dict(neg_lines + pos_lines)

# delete file
print("Make delete file...")
if not os.path.exists(hp.data_path + '/delete'):
    os.makedirs(hp.data_path + '/delete')
delete(hp.data_path + '/sentiment.train.0',
       hp.data_path + '/sentiment.train.1',
       mode='train')
delete(hp.data_path + '/sentiment.dev.0',
       hp.data_path + '/sentiment.dev.1',
       mode='dev')
delete(hp.data_path + '/sentiment.test.0',
       hp.data_path + '/sentiment.test.1',
       mode='test')
unif = 0 if FLAGS.static else FLAGS.unif
# w2vModel = Word2VecModel(FLAGS.glove_embed_file, vocab, unif)
w2vModel = models[int(sys.argv[1])](FLAGS.glove_embed_file, vocab, unif)

# Load data
print("Loading data...")
x_text, y = data_helpers.load_data_and_labels(FLAGS.data_file)
print(len(x_text))

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
max_document_length = 100  # NOTE: overrides the computed maximum above
# vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
# x = np.array(list(vocab_processor.fit_transform(x_text)))
dataset = load_sentences([(y1, x) for x, y1 in zip(x_text, y)], w2vModel.vocab,
                         FLAGS.clean, FLAGS.chars, max_document_length)
x_train = dataset.x
print(dataset.x.shape)
y_train = dataset.y

x_text_test, y_test = data_helpers.load_data_and_labels(FLAGS.test_file)
dataset2 = load_sentences([(y1, x) for x, y1 in zip(x_text_test, y_test)],
                          w2vModel.vocab, FLAGS.clean, FLAGS.chars,
                          max_document_length)
x_test = dataset2.x
print(dataset2.x.shape)
y_test = dataset2.y

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y_train)))
def template_based():
    if not os.path.exists(hp.data_path + '/generate'):
        os.makedirs(hp.data_path + '/generate')
    print('template_based...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_contents_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(x.split('\t')[:2] for x in pos_lines)
            pos_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in pos_lines)
            sentences1 = neg_sentences_contents_dict
            sentences2 = pos_sentences_contents_dict
            marker2 = pos_sentences_marker_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_contents_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_contents_dict = dict(x.split('\t')[:2] for x in pos_lines)
            neg_sentences_marker_dict = dict(
                [x.split('\t')[0], x.split('\t')[2]] for x in neg_lines)
            sentences1 = pos_sentences_contents_dict
            sentences2 = neg_sentences_contents_dict
            marker2 = neg_sentences_marker_dict

        with codecs.open(hp.data_path + '/generate/template_based.test.' + num,
                         'w', 'utf-8') as fout:
            for sentence1 in sentences1:
                dist_dict = {}
                # Search up to hp.max_candidates candidates, chosen randomly
                # (random.sample needs a sequence, not a dict view).
                frag_sentences2 = random.sample(list(sentences2.keys()),
                                                hp.max_candidates)
                sentence1_content = sentences1[sentence1]
                for sentence2 in frag_sentences2:
                    # distance between the two contents
                    dist_dict[sentence2] = levenshtein_distance(
                        sentence1_content, sentences2[sentence2])
                min_sentence = min(dist_dict, key=dist_dict.get)
                nearest_marker = marker2[min_sentence]
                sentence1_list = sentence1.split(' ')
                sentence1_content_list = sentences1[sentence1].split(' ')
                # Insert the retrieved attribute marker where the original
                # marker was deleted; if sentence and content agree on every
                # content position, the marker was at the end.
                index = len(sentence1_content_list)
                for idx in range(len(sentence1_content_list)):
                    if sentence1_list[idx] != sentence1_content_list[idx]:
                        index = idx
                        break
                generated_sentence = (' '.join(sentence1_content_list[:index]) +
                                      ' ' + nearest_marker + ' ' +
                                      ' '.join(sentence1_content_list[index:]))
                # collapse the double spaces left by the insertion
                generated_sentence = generated_sentence.replace('  ', ' ')
                fout.write("- expected: " + sentence1 + "\n")
                fout.write("- got: " + generated_sentence + "\n\n")
                fout.flush()
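# levenshtein_distance is used here and in retrieve_only below; a minimal
# sketch assuming the standard dynamic-programming edit distance (whether
# the real helper compares characters or tokens is an assumption):
def levenshtein_distance_sketch(a, b):
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                # deletion
                            curr[j - 1] + 1,            # insertion
                            prev[j - 1] + (ca != cb)))  # substitution
        prev = curr
    return prev[-1]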
def retrieve_only(dist_mode='levenshtein'):
    print('retrieve_only with ' + dist_mode + ' distance...')
    for num in ['0', '1']:
        if num == '0':
            neg_lines = load_sentences(hp.data_path + '/delete/delete.test.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.train.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = neg_sentences_dict
            sentences2 = pos_sentences_dict
        else:
            neg_lines = load_sentences(hp.data_path + '/delete/delete.train.0',
                                       False)
            pos_lines = load_sentences(hp.data_path + '/delete/delete.test.1',
                                       False)
            neg_sentences_dict = dict(x.split('\t')[:2] for x in neg_lines)
            pos_sentences_dict = dict(x.split('\t')[:2] for x in pos_lines)
            sentences1 = pos_sentences_dict
            sentences2 = neg_sentences_dict

        with codecs.open(hp.data_path + '/generate/retrieve_only.test.' + num,
                         'w', 'utf-8') as fout:
            # Levenshtein distance
            if dist_mode == 'levenshtein':
                for sentence1 in sentences1:
                    dist_dict = {}
                    # Search up to hp.max_candidates candidates, chosen randomly.
                    frag_sentences2 = random.sample(list(sentences2.keys()),
                                                    hp.max_candidates)
                    for sentence2 in frag_sentences2:
                        # distance between the two contents
                        dist_dict[sentence2] = levenshtein_distance(
                            sentences1[sentence1], sentences2[sentence2])
                    nearest_sentence = min(dist_dict, key=dist_dict.get)
                    fout.write("- expected: " + sentence1 + "\n")
                    fout.write("- got: " + nearest_sentence + "\n\n")
                    fout.flush()
            # Embedding distance between sentence1 and sentence2 using the
            # Universal Sentence Encoder; it is slow and does not perform well.
            if dist_mode == 'embedding':
                embed = hub.Module(
                    "https://tfhub.dev/google/universal-sentence-encoder/1")
                with tf.Session() as session:
                    session.run([
                        tf.global_variables_initializer(),
                        tf.tables_initializer()
                    ])
                    embedded_sentences1 = session.run(
                        embed(list(sentences1.values())))
                    for sentence1, embedded_sentence1 in zip(
                            sentences1.keys(), embedded_sentences1):
                        dist_dict = {}
                        # Search up to hp.max_candidates candidates, chosen randomly.
                        frag_sentences2 = random.sample(list(sentences2.keys()),
                                                        hp.max_candidates)
                        frag_contents2 = [sentences2[s2] for s2 in frag_sentences2]
                        embedded_sentences2 = session.run(embed(frag_contents2))
                        for idx, embedded_sentence2 in enumerate(embedded_sentences2):
                            # inner product is a similarity, so take the max below
                            dist_dict[idx] = np.inner(embedded_sentence1,
                                                      embedded_sentence2)
                        nearest_idx = max(dist_dict, key=dist_dict.get)
                        nearest_sentence = frag_sentences2[nearest_idx]
                        fout.write("- expected: " + sentence1 + "\n")
                        fout.write("- got: " + nearest_sentence + "\n\n")
                        fout.flush()
# Constants
# paths to files

# where the mapping file is stored
mapping_file = './data/mapping.pkl'

# where the model is stored
name = parameters['name']
model_name = models_path + name  # get_name(parameters)
if not os.path.exists(models_path):
    os.makedirs(models_path)

# ##### Load data and preprocess
train_sentences = load_sentences(parameters['train'], parameters['zeros'])
test_sentences = load_sentences(parameters['test'], parameters['zeros'])
dev_sentences = load_sentences(parameters['dev'], parameters['zeros'])

update_tag_scheme(train_sentences, parameters['tag_scheme'])
update_tag_scheme(dev_sentences, parameters['tag_scheme'])
update_tag_scheme(test_sentences, parameters['tag_scheme'])

print(train_sentences[0])
print(dev_sentences[0])
print(test_sentences[0])

dico_words, word_to_id, id_to_word = to_word_mapping(train_sentences,
                                                     parameters['lower'])
dico_chars, char_to_id, id_to_char = to_char_mapping(train_sentences)
dico_tags, tag_to_id, id_to_tag = to_tag_mapping(train_sentences)
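# to_word_mapping / to_char_mapping / to_tag_mapping are defined
# elsewhere; a minimal sketch of the word mapping they are assumed to
# build, frequency-sorted with an unknown token (the <UNK> convention
# and its count are assumptions):
def to_word_mapping_sketch(sentences, lower):
    dico = {}
    for sentence in sentences:
        for token in sentence:
            word = token[0].lower() if lower else token[0]
            dico[word] = dico.get(word, 0) + 1
    dico['<UNK>'] = 10000000  # keep the unknown token at the front
    word_to_id = {w: i for i, (w, _) in enumerate(
        sorted(dico.items(), key=lambda x: (-x[1], x[0])))}
    id_to_word = {i: w for w, i in word_to_id.items()}
    return dico, word_to_id, id_to_word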