def main(): usage = "%prog persona_dir" parser = OptionParser(usage=usage) #parser.add_option('-a', dest='alpha', default=0.1, # help='smoothing parameter for multinomial distribution: default=%default') parser.add_option('-b', dest='beta', default=0.000001, help='Regularization strength: default=%default') parser.add_option('-d', dest='hidden_dim', default=50, help='Hidden node dimension: default=%default') parser.add_option('--edge_dim', dest='edge_dim', default=3, help='Edge vector dimension: default=%default') parser.add_option('--pos_dim', dest='pos_dim', default=3, help='Edge vector dimension: default=%default') parser.add_option('-e', dest='epochs', default=20, help='Number of epochs: default=%default') parser.add_option('-i', dest='iter_display', default=4000, help='Number of iterations between output: default=%default') parser.add_option('-o', dest='optimization', default='adagrad', help='Optimization method [sgd|sgdm|adagrad]: default=%default') parser.add_option('-l', dest='learning_rate', default=0.2, help='Initial learning rate: default=%default') parser.add_option('--emb_lr', dest='emb_lr', default=0.1, help='Learning rate for embeddings (not for sgd): default=%default') parser.add_option('--decay', dest='decay', default=1.00, help='Learning rate decay: default=%default') parser.add_option('--momentum', dest='momentum', default=0.5, help='Momentum parameter (sgdm only): default=%default') parser.add_option('-s', dest='seed', default=42, help='Random seed: default=%default') parser.add_option('--glove_file', dest='glove_file', default='', help='Location of glove file: default=do not load') parser.add_option('--word2vec_file', dest='word2vec_file', default='', help='Location of word2vec file: default=do not load') parser.add_option('--n_dev', dest='n_dev', default=5000, help='Number of sentences to use as a dev set: default=%default') parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False, help='Skip the evaluation between epochs: default=%default') parser.add_option('--min_df', dest='min_df', default=2, help='Minimum document frequency for input and output vocabs: default=%default') parser.add_option('--latent_dim', dest='latent_dim', default=100, help='Minimum document frequency for input and output vocabs: default=%default') (options, args) = parser.parse_args() assert len(args) > 0 persona_dir = args[0] seed = int(options.seed) n_epochs = int(options.epochs) #alpha = float(options.alpha) beta = float(options.beta) lr = float(options.learning_rate) emb_lr = float(options.emb_lr) iter_display = int(options.iter_display) opti_method = options.optimization lr_decay = float(options.decay) momentum = float(options.momentum) glove_file = options.glove_file word2vec_file = options.word2vec_file no_eval = options.no_eval n_dev = int(options.n_dev) min_df = int(options.min_df) if seed > 0: np.random.seed(seed) random.seed(seed) dh = int(options.hidden_dim) de = int(options.edge_dim) dt = int(options.pos_dim) dx = 300 dz = int(options.latent_dim) np.__config__.show() # load input data input_filename = os.path.join(persona_dir, 'rnn_data.json') with codecs.open(input_filename, 'r', encoding='utf-8') as input_file: orig_data = json.load(input_file) n_entities = len(orig_data) print "Loaded", n_entities, "entities" tuple_type_set = set() data = [] mention_id = 0 n_tuples = 0 for d_i, d in enumerate(orig_data): words = [] doc_id = d[0] entity_id = d[1] appearances = d[2] for tuples in appearances: n_tuples += len(tuples) for t in tuples: tuple_id = t[0] 
#orig_index = t[1] role_type = t[1] word = t[2] arc = t[3] head = t[4] pos = t[5] head_phrase = t[6] words.append(word) #words.append(tuples[0][5]) words = [START] + words + [END] item = {'doc_id': doc_id, 'entity_id': entity_id, 'words': words} data.append(item) print len(data), "appearances" print n_tuples, "tuples" print np.bincount([len(d['words']) for d in data]) print "Building vocabulary" #attribute_counts = Counter() #agent_counts = Counter() #patient_counts = Counter() word_counts = Counter() for d in data: #attribute_counts.update(d['attributes']) #agent_counts.update(d['agent_roles']) #patient_counts.update(d['patient_roles']) word_counts.update(d['words']) n_entities = len(data) train_set = set(range(n_entities)) dev_set = set(np.random.choice(n_entities, n_dev).tolist()) train_set = train_set - dev_set print "train:", len(train_set), " dev", len(dev_set) # pad sentences with start and end tokens, and adjust target index accordingly train_data = [data[i] for i in train_set] dev_data = [data[i] for i in dev_set] #attribute_counts.update([UNK]) #agent_counts.update([UNK]) #patient_counts.update([UNK]) #attribute_vocab = [w for w, c in attribute_counts] #agent_vocab = [w for w, c in agent_counts] #patient_vocab = [w for w, c in patient_counts] word_counts.update(UNK) vocab = [w for w, c in word_counts.items()] #print "Attribute vocab size =", len(attribute_vocab) #print "Agent vocab size =", len(agent_vocab) #print "Patient vocab size =", len(patient_vocab) #attribute_vocab.sort() #agent_vocab.sort() #patient_vocab.sort() vocab_size = len(vocab) print "full vocab size =", vocab_size vocab.sort() #attribute_vocab_index = dict(zip(attribute_vocab, range(len(attribute_vocab)))) #agent_vocab_index = dict(zip(agent_vocab, range(len(agent_vocab)))) #patient_vocab_index = dict(zip(patient_vocab, range(len(patient_vocab)))) vocab_index = dict(zip(vocab, range(vocab_size))) print "Indexing words" for t in data: t['idxs'] = [vocab_index[w] if w in vocab_index else vocab_index[UNK] for w in t['words']] initial_embeddings = None if glove_file != '': initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32) print "Loading glove vectors" glove_vocab, glove_embeddings = load_vectors.load_glove_vectors(glove_file, vocab) glove_index = dict(zip(glove_vocab, range(len(glove_vocab)))) for w_i, w in enumerate(vocab): if w in glove_index: initial_embeddings[w_i, :] = glove_embeddings[glove_index[w], :] else: initial_embeddings[w_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx)) print len(list(set(vocab) - set(glove_vocab))), "words in training vocabulary with no glove vector" #if not no_eval: # print len(list(dev_vocab - set(glove_vocab))), "words in dev vocabulary with no glove vector" # print len(list(test_vocab - set(glove_vocab))), "words in test vocabulary with no glove vector" elif word2vec_file != '': # load pre-trained word vectors print "Loading pre-trained word vectors" vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True) word2vec_vocab = set() print "Preparing word vectors" initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32) for v_i, v in enumerate(vocab): if v in vectors: initial_embeddings[v_i, :] = vectors[v] word2vec_vocab.add(v) else: initial_embeddings[v_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx)) print len(list(set(vocab) - word2vec_vocab)), "words in training vocabulary with no word2vec vector" #if not no_eval: # print len(list(dev_vocab - word2vec_vocab)), "words in dev vocabulary with no word2vec vector" # print 
len(list(test_vocab - word2vec_vocab)), "words in test vocabulary with no word2vec vector" # create the LSTM theano_seed = np.random.randint(2 ** 30) #tnn = TreeRNN(word_vocab_size, edge_vocab_size, pos_vocab_size, dh, dx, de, dt, beta=beta, # initial_embeddings=initial_embeddings, initial_edge_embeddings=initial_edge_embeddings, # update=opti_method, momentum=momentum, seed=theano_seed) # create generator LAE = LinearAutoencoder(vocab_size, dh, dx, dz, beta=beta, initial_embeddings=initial_embeddings, update=opti_method, momentum=momentum, seed=theano_seed) # create TreeAutoencoder best_dev_log_loss = 1e6 print "Training" for epoch in range(n_epochs): sum_log_loss = 0 sum_loss = 0 mistakes = 0 pos_mistakes = 0 n_items = 0 keys = range(len(train_data)) random.shuffle(keys) for k_i, k in enumerate(keys): d = train_data[k] word_idxs = d['idxs'] seq_length = len(word_idxs) n_words = seq_length - 2 n_items += n_words idxs_array = np.array(np.reshape(word_idxs, (seq_length, 1)), dtype=np.int32) mask = np.ones((seq_length, 1), dtype=np.int32) pred_y, p_y_given_x, log_loss, loss, mu, log_sigma, KLD = LAE.train(idxs_array, mask) sum_log_loss += log_loss sum_loss += loss mistakes += np.sum(pred_y != idxs_array[1:n_words+1, :]) if k_i % iter_display == 0 and k_i > 0: #print [word_vocab[w] for w in word_idxs] # print np.max(h_p), np.mean(h_p), np.array(h_p) denom = float(n_items) #print d['word'], d['edge'], d['output'], output_vocab[true_y], output_vocab[pred_y], p_y_given_x[true_y], p_y_given_x[pred_y] print '%d\t%d\t%.4f\t%.4f\t%.4f\t%.6f' % \ (epoch, k_i, sum_log_loss/denom, sum_loss/denom, mistakes/denom, KLD), ' '.join([vocab[i] for i in word_idxs]), ' '.join([vocab[i] for i in pred_y[:, 0]]) start_idxs = vocab_index[START] * np.ones((1,), dtype=np.int32) pred_y, pred_y_blind = LAE.predict_both(idxs_array, mask, start_idxs) print ' '.join([vocab[i] for i in pred_y[:, 0]]) print ' '.join([vocab[i] for i in pred_y_blind[:, 0]]) #print mu #print log_sigma #KLD = T.mean(0.5 * T.sum(1 + log_sigma - mu**2 - T.exp(log_sigma), axis=1)) #print 0.5 * np.sum(1 + log_sigma - mu ** 2 - np.exp(log_sigma), axis=1) if k_i % 50000 == 0 and k_i > 0: if not no_eval: zo_loss, dev_log_loss_words = autoencoder_evaluate(dev_data, LAE, vocab, print_examples=True) print "Dev evaluation at", k_i, ":", zo_loss, dev_log_loss_words dev_log_loss = dev_log_loss_words if best_dev_log_loss > dev_log_loss: best_dev_log_loss = dev_log_loss print "New best dev log loss =", dev_log_loss #print "Saving vectors" #output_basename = os.path.join(persona_dir, 'vectors_auto') #save_vectors(data, LAE, n_tuples, dh, output_basename=output_basename) #phi = np.array(tae.get_phi())[0] #save_phi(phi, edge_vocab, word_vocab, output_basename) if not no_eval: zo_loss, dev_log_loss_words = autoencoder_evaluate(dev_data, LAE, vocab, print_examples=True) print "Dev evaluation after epoch", epoch, ":", zo_loss, dev_log_loss_words dev_log_loss = dev_log_loss_words if best_dev_log_loss > dev_log_loss: best_dev_log_loss = dev_log_loss print "New best dev log loss =", dev_log_loss output_basename = os.path.join(persona_dir, 'vectors_auto_' + str(epoch)) #print "Saving vectors to", output_basename #save_vectors(data, LAE, n_tuples, dh, output_basename=output_basename) #phi = np.array(tae.get_phi())[0] #save_phi(phi, edge_vocab, word_vocab, output_basename) lr *= lr_decay
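

# ---------------------------------------------------------------------------
# Reference sketch (illustration only; nothing below is called by main()).
# The commented-out KLD expression in the training loop above,
#     T.mean(0.5 * T.sum(1 + log_sigma - mu**2 - T.exp(log_sigma), axis=1)),
# is the negative of the usual closed-form KL divergence between the encoder's
# Gaussian q(z|x) = N(mu, sigma^2) and a standard normal prior, assuming
# log_sigma holds the log-variance (as the exp() in that expression suggests).
# The helper below is a hypothetical NumPy version of the positive KL term,
# per example; LAE.train is assumed to compute the Theano equivalent internally.
def _kl_to_standard_normal(mu, log_sigma):
    # KL(N(mu, exp(log_sigma)) || N(0, I)), summed over the latent dimensions
    return -0.5 * np.sum(1.0 + log_sigma - mu ** 2 - np.exp(log_sigma), axis=1)
# ---------------------------------------------------------------------------
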
def main(): usage = "%prog input_filename" parser = OptionParser(usage=usage) parser.add_option('-a', dest='alpha', default=0.000001, help='Regularization strength: default=%default') parser.add_option('-d', dest='hidden_dim', default=100, help='Hidden node dimension: default=%default') parser.add_option('-e', dest='epochs', default=20, help='Number of epochs: default=%default') parser.add_option('-i', dest='iter_display', default=4000, help='Number of iterations between output: default=%default') parser.add_option('-o', dest='optimization', default='adagrad', help='Optimization method [sgd|sgdm|adagrad]: default=%default') parser.add_option('-l', dest='learning_rate', default=0.05, help='Initial learning rate: default=%default') parser.add_option('--emb_lr', dest='emb_lr', default=0.01, help='Learning rate for embeddings (not for sgd): default=%default') parser.add_option('--decay', dest='decay', default=1.00, help='Learning rate decay: default=%default') parser.add_option('--momentum', dest='momentum', default=0.5, help='Momentum parameter (sgdm only): default=%default') parser.add_option('-s', dest='seed', default=42, help='Random seed: default=%default') parser.add_option('--glove_file', dest='glove_file', default='', help='Location of glove file: default=do not load') parser.add_option('--word2vec_file', dest='word2vec_file', default='', help='Location of word2vec file: default=do not load') parser.add_option('--n_dev', dest='n_dev', default=4000, help='Number of sentences to use as a dev set: default=%default') parser.add_option('--min_wf', dest='min_wf', default=5, help='Exclude words that occur less than this number of times: default=%default') parser.add_option('--no_eval', action="store_true", dest="no_eval", default=False, help='Skip the evaluation between epochs: default=%default') #parser.add_option('--drop_x', action="store_true", dest="drop_x", default=False, # help='Add dropout to the input layer: default=%default') (options, args) = parser.parse_args() input_filename = args[0] #lr = 0.002 # 0.05 / batch_size=25 #alpha = 0.000002 # 10^-4 / batch_size=25 / 2.0 load_all_word_vectors = False seed = int(options.seed) n_epochs = int(options.epochs) alpha = float(options.alpha) lr = float(options.learning_rate) emb_lr = float(options.emb_lr) iter_display = int(options.iter_display) opti_method = options.optimization lr_decay = float(options.decay) momentum = float(options.momentum) glove_file = options.glove_file word2vec_file = options.word2vec_file no_eval = options.no_eval n_dev = int(options.n_dev) min_wf = int(options.min_wf) #drop_x = int(options.drop_x) if seed > 0: np.random.seed(seed) random.seed(seed) dh = int(options.hidden_dim) dx = 300 np.__config__.show() print THEANO_FLAGS # load sentences with codecs.open(input_filename, 'r', encoding='utf-8') as input_file: data = json.load(input_file) n_sentences = len(data) print "Loaded", n_sentences, "sentences" train_set = set(range(n_sentences)) dev_set = set(np.random.choice(n_sentences, n_dev).tolist()) train_set = train_set - dev_set # pad sentences with start and end tokens, and adjust target index accordingly train_data = [{'target': data[i][0] + 1, 'words': ['__START__'] + data[i][1].split() + ['__END__']} for i in train_set] dev_data = [{'target': data[i][0] + 1, 'words': ['__START__'] + data[i][1].split() + ['__END__']} for i in dev_set] print "Building vocabulary" vocab = Counter() target_vocab = Counter() for d in train_data: vocab.update(d['words']) target_vocab.update([d['words'][d['target']]]) print "Filtering 
vocabulary" vocab = [v for v, c in vocab.items() if c >= min_wf] vocab.append('__UNK__') target_vocab = [v for v, c in target_vocab.items() if c > 1] target_vocab.append('__UNK__') vocab = list(vocab) vocab.sort() vocab_size = len(vocab) vocab_index = dict(zip(vocab, range(vocab_size))) print "Size of full vocab =", vocab_size target_vocab = list(target_vocab) target_vocab.sort() target_vocab_size = len(target_vocab) target_vocab_index = dict(zip(target_vocab, range(target_vocab_size))) print "Size of target vocab =", target_vocab_size nc = target_vocab_size if glove_file != '': initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32) print "Loading glove vectors" glove_vocab, glove_embeddings = load_vectors.load_glove_vectors(glove_file, vocab) glove_index = dict(zip(glove_vocab, range(len(glove_vocab)))) for w_i, w in enumerate(vocab): if w in glove_index: initial_embeddings[w_i, :] = glove_embeddings[glove_index[w], :] else: initial_embeddings[w_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx)) print len(list(set(vocab) - set(glove_vocab))), "words in training vocabulary with no glove vector" #if not no_eval: # print len(list(dev_vocab - set(glove_vocab))), "words in dev vocabulary with no glove vector" # print len(list(test_vocab - set(glove_vocab))), "words in test vocabulary with no glove vector" elif word2vec_file != '': # load pre-trained word vectors print "Loading pre-trained word vectors" vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True) word2vec_vocab = set() print "Preparing word vectors" initial_embeddings = np.zeros([vocab_size, dx], dtype=np.float32) for v, i in vocab_index.items(): if v == '_': initial_embeddings[i, :] = np.zeros(dx) elif v in vectors: initial_embeddings[i, :] = vectors[v] word2vec_vocab.add(v) else: initial_embeddings[i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx)) print len(list(set(vocab) - word2vec_vocab)), "words in training vocabulary with no word2vec vector" #if not no_eval: # print len(list(dev_vocab - word2vec_vocab)), "words in dev vocabulary with no word2vec vector" # print len(list(test_vocab - word2vec_vocab)), "words in test vocabulary with no word2vec vector" initial_embeddings = np.zeros([vocab_size, 2]) for i in range(40): initial_embeddings[i, :] = i print "Indexing words" for t in train_data: t['idxs'] = [vocab_index[w] if w in vocab_index else vocab_index['__UNK__'] for w in t['words']] target_word = t['words'][t['target']] if target_word in target_vocab_index: t['target_idx'] = target_vocab_index[target_word] else: t['target_idx'] = target_vocab_index['__UNK__'] for t in dev_data: t['idxs'] = [vocab_index[w] if w in vocab_index else vocab_index['__UNK__'] for w in t['words']] target_word = t['words'][t['target']] if target_word in target_vocab_index: t['target_idx'] = target_vocab_index[target_word] else: t['target_idx'] = target_vocab_index['__UNK__'] # create the LSTM theano_seed = np.random.randint(2 ** 30) ctreeLSTM = ConstituencyTreeLSTM(vocab_size, dh, 2, nc, initial_embeddings=initial_embeddings, alpha=alpha, update=opti_method, seed=theano_seed, momentum=momentum) sent_lengths = [(len(t['words']), key) for key, t in enumerate(train_data)] sent_lengths.sort() if not no_eval: print "Pre-training evaluation" #train_z_o_loss, train_log_loss = evaluate(dev_data, ctreeLSTM, vocab_index, drop_x) print "Dev evaluation:", evaluate(dev_data, ctreeLSTM, vocab_index) #test_z_o_loss, test_log_loss = evaluate(test_root_trees, ctreeLSTM, vocab_index, drop_x) #print 
('epoch=%d\ttrain_0/1=%.3f\ttrain_log=%.3f\tdev_0/1=%.3f\tdev_log=%.3f\ttest_0/1=%.3f\ttest_log=%.3f') % \ # (-1, train_z_o_loss, train_log_loss, valid_z_o_loss, valid_log_loss, test_z_o_loss, test_log_loss) print "Training" for epoch in range(n_epochs): sum_log_loss = 0 sum_loss = 0 mistakes = 0 pred0 = 0 pred1 = 0 if epoch == 0: print "sorting trees" keys = [key for length, key in sent_lengths] else: keys = range(len(train_data)) random.shuffle(keys) print "epoch\titems\tloss\tl+reg\terrs\tpredict 1" for k_i, t_i in enumerate(keys): t = train_data[t_i] words = t['words'] idxs = t['idxs'] target = t['target'] value = t['target_idx'] #idxs = [vocab_index[w] for w in words] counter = np.array(np.arange(0, len(idxs)), dtype=np.int32) print counter #pred_y, p_y_given_x, shape = ctreeLSTM.predict_prob(idxs, left_mask, right_mask, counter) pred_y, p_y_given_x, log_loss, loss, c = ctreeLSTM.train(counter, target, value, lr, emb_lr, 1) c = np.array(c) print c sys.exit() sum_log_loss += log_loss sum_loss += loss if pred_y != value: mistakes += 1 if k_i % iter_display == 0: d = float(k_i+1) print ' '.join(words[1:-1]), ':', target_vocab[value], target_vocab[pred_y], p_y_given_x[value], p_y_given_x[pred_y] print '%d\t%d\t%.4f\t%.4f\t%.4f' % \ (epoch, k_i, sum_log_loss/d, sum_loss/d, mistakes/d) if not no_eval: #train_z_o_loss, train_log_loss = evaluate(train_root_trees, ctreeLSTM, vocab_index, drop_x) print "Dev evaluation:", evaluate(dev_data, ctreeLSTM, vocab_index) print "Saving results" output_filename = 'vectors' + str(epoch) + '.json' print "Train evaluation:", evaluate(train_data, ctreeLSTM, vocab_index, save_vectors=True, output_filename=output_filename) #test_z_o_loss, test_log_loss = evaluate(test_root_trees, ctreeLSTM, vocab_index, drop_x) #print ('epoch=%d\ttrain_0/1=%.3f\ttrain_log=%.3f\tdev_0/1=%.3f\tdev_log=%.3f\ttest_0/1=%.3f\ttest_log=%.3f') % \ # (epoch, train_z_o_loss, train_log_loss, valid_z_o_loss, valid_log_loss, test_z_o_loss, test_log_loss) lr *= lr_decay
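

# Example invocation (hypothetical script and file names; the positional argument
# is a JSON file of [target_index, sentence_string] pairs, which is the format
# consumed above, and the glove file is assumed to hold 300-dimensional vectors
# to match dx):
#   python train_tree_lstm.py sentences.json --glove_file glove.840B.300d.txt -e 20 -o adagrad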