def main():
    logger.info("Creating vocabulary dictionary...")
    vocab = Dictionary.from_corpus(train_data, unk='<unk>')
    logger.info("Creating tag dictionary...")
    vocab_tags = Dictionary.from_corpus_tags(train_data, unk='<unk>')
    vocab.add_word('<s>')
    vocab.add_word('</s>')
    V = vocab.size()
    vocab_tags.add_word('<s>')
    vocab_tags.add_word('</s>')
    V_tag = vocab_tags.size()
    # binary feature matrix: one row per tag id, one column per sub-tag
    feature_matrix = np.zeros((vocab_tags.size(), vocab_tags.num_sub_tags))
    feature_matrix[0, 0] = 1  # unk encoding
    for tag, tag_id in vocab_tags:
        if tag == "<s>":
            feature_matrix[tag_id, 1] = 1
        elif tag == "</s>":
            feature_matrix[tag_id, 2] = 1
        else:
            for sub_tag in vocab_tags.map_tag_to_sub_tags[tag]:
                val = vocab_tags.map_sub_to_ids[sub_tag]
                feature_matrix[tag_id, val] = 1
    # load trained word representations and rank the vocabulary by
    # cosine distance to the query word
    Q = cPickle.load(open(sys.argv[4], 'rb'))
    print "START COMPARING"
    word = sys.argv[5]
    word_id = vocab.lookup_id(word)
    words = []
    for j, q in enumerate(Q):
        words.append((j, vocab.lookup_word(j), cosine(Q[word_id], q)))
    words.sort(key=lambda x: x[2])
    print words[:20]
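# The loop above calls scipy's `cosine` once per vocabulary entry.  Below is a
# minimal sketch of the same nearest-neighbour query as a single vectorised
# numpy expression; the helper is an illustration, not part of the original
# module, and assumes Q is a (V, K) array of word representations, word_id
# indexes the query row, and numpy is available as the module-level `np`.
def nearest_neighbours(Q, word_id, n=20):
    """Return ids of the n rows of Q closest to Q[word_id] by cosine distance."""
    q = Q[word_id]
    # cosine distance = 1 - (Q_j . q) / (|Q_j| * |q|), computed for all rows at once
    norms = np.sqrt((Q ** 2).sum(axis=1)) * np.sqrt((q ** 2).sum())
    dist = 1.0 - Q.dot(q) / norms
    return np.argsort(dist)[:n]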
def train_lbl(train_data, dev_data, test_data=[], K=20, word_context_sz=2,
              char_context_sz=2, learning_rate=1.0, rate_update='simple',
              epochs=10, batch_size=100, rng=None, patience=None,
              patience_incr=2, improvement_thrs=0.995, validation_freq=1000):
    """
    Train log-bilinear model
    """
    # create vocabulary from train data, plus <s>, </s>
    vocab = Dictionary.from_corpus(train_data, unk='<unk>')
    vocab.add_word('<s>')
    vocab.add_word('</s>')
    V = vocab.size()
    # initialize random generator if not provided
    rng = np.random.RandomState() if not rng else rng
    # generate (context, target) pairs of word ids
    train_word_x, train_char_x, train_set_y = make_instances(
        train_data, vocab, word_context_sz, char_context_sz)
    dev_word_x, dev_char_x, dev_set_y = make_instances(
        dev_data, vocab, word_context_sz, char_context_sz)
    test_word_x, test_char_x, test_set_y = make_instances(
        test_data, vocab, word_context_sz, char_context_sz)
    # number of minibatches for training
    n_train_batches = train_word_x.get_value(borrow=True).shape[0] / batch_size
    n_dev_batches = dev_word_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_word_x.get_value(borrow=True).shape[0] / batch_size
    # build the model
    logger.info("Build the model ...")
    index = T.lscalar()
    x_word = T.imatrix('x_word')
    x_char = T.imatrix('x_char')
    y = T.ivector('y')
    # create log-bilinear model
    lbl = LogBilinearLanguageModel(x_word, x_char, V, K, word_context_sz,
                                   char_context_sz, rng)
    # cost function is negative log likelihood of the training data
    cost = lbl.negative_log_likelihood(y)
    # compute the gradient
    gparams = []
    for param in lbl.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)
    # specify how to update the parameters of the model
    updates = []
    for param, gparam in zip(lbl.params, gparams):
        updates.append((param, param - learning_rate * gparam))
    # function that computes log-probability of the dev set
    logprob_dev = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x_word: dev_word_x[index * batch_size:(index + 1) * batch_size],
            x_char: dev_char_x[index * batch_size:(index + 1) * batch_size],
            y: dev_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # function that computes log-probability of the test set
    logprob_test = theano.function(
        inputs=[index],
        outputs=cost,
        givens={
            x_word: test_word_x[index * batch_size:(index + 1) * batch_size],
            x_char: test_char_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # function that returns the cost and updates the parameters
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x_word: train_word_x[index * batch_size:(index + 1) * batch_size],
            x_char: train_char_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # perplexity functions
    def compute_dev_logp():
        return np.mean([logprob_dev(i) for i in xrange(n_dev_batches)])

    def compute_test_logp():
        return np.mean([logprob_test(i) for i in xrange(n_test_batches)])

    def ppl(neg_logp):
        return np.power(2.0, neg_logp)

    # train model
    logger.info("training model...")
    best_params = None
    last_epoch_dev_ppl = np.inf
    best_dev_ppl = np.inf
    test_ppl = np.inf
    test_score = 0
    start_time = time.clock()
    done_looping = False
    for epoch in xrange(epochs):
        if done_looping:
            break
        logger.debug('epoch %i' % epoch)
        for minibatch_index in xrange(n_train_batches):
            itr = epoch * n_train_batches + minibatch_index
            train_logp = train_model(minibatch_index)
            logger.debug('epoch %i, minibatch %i/%i, train minibatch log prob %.4f ppl %.4f' %
                         (epoch, minibatch_index + 1, n_train_batches,
                          train_logp, ppl(train_logp)))
            if (itr + 1) % validation_freq == 0:
                # compute perplexity on dev set, lower is better
                dev_logp = compute_dev_logp()
                dev_ppl = ppl(dev_logp)
                logger.debug('epoch %i, minibatch %i/%i, dev log prob %.4f ppl %.4f' %
                             (epoch, minibatch_index + 1, n_train_batches,
                              dev_logp, ppl(dev_logp)))
                # if we got the lowest perplexity until now
                if dev_ppl < best_dev_ppl:
                    # increase patience if the improvement is good enough
                    if patience and dev_ppl < best_dev_ppl * improvement_thrs:
                        patience = max(patience, itr * patience_incr)
                    best_dev_ppl = dev_ppl
                    test_logp = compute_test_logp()
                    test_ppl = ppl(test_logp)
                    logger.debug('epoch %i, minibatch %i/%i, test log prob %.4f ppl %.4f' %
                                 (epoch, minibatch_index + 1, n_train_batches,
                                  test_logp, ppl(test_logp)))
            # stop learning if no improvement was seen for a long time
            if patience and patience <= itr:
                done_looping = True
                break
        # adapt learning rate
        if rate_update == 'simple':
            # set learning rate to 1 / (epoch+1)
            learning_rate = 1.0 / (epoch + 1)
        elif rate_update == 'adaptive':
            # halve learning rate if perplexity increased at end of epoch (Mnih and Teh 2012)
            this_epoch_dev_ppl = ppl(compute_dev_logp())
            if this_epoch_dev_ppl > last_epoch_dev_ppl:
                learning_rate /= 2.0
            last_epoch_dev_ppl = this_epoch_dev_ppl
        elif rate_update == 'constant':
            # keep learning rate constant
            pass
        else:
            raise ValueError("Unknown learning rate update strategy: %s" % rate_update)
    end_time = time.clock()
    total_time = end_time - start_time
    logger.info('Optimization complete with best dev ppl of %.4f and test ppl %.4f' %
                (best_dev_ppl, test_ppl))
    logger.info('Training took %d epochs, with %.1f epochs/sec' %
                (epoch + 1, float(epoch + 1) / total_time))
    logger.info("Total training time %d days %d hours %d min %d sec." %
                (total_time / 60 / 60 / 24, total_time / 60 / 60 % 24,
                 total_time / 60 % 60, total_time % 60))
    # return model
    return lbl
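# Perplexity note (illustration, not part of the original module): `ppl`
# exponentiates the mean minibatch cost, so under the assumption that
# negative_log_likelihood returns the mean negative base-2 log-probability of
# the targets, ppl(cost) is the usual perplexity
#     PPL = 2 ** ( -(1/N) * sum_i log2 P(w_i | h_i) )
# A tiny self-check on hand-picked per-token probabilities:
def _perplexity_example():
    probs = np.array([0.25, 0.5, 0.125])   # P(w_i | h_i) for three tokens
    neg_logp = -np.mean(np.log2(probs))    # mean negative log2 probability = 2.0
    return np.power(2.0, neg_logp)         # perplexity = 4.0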
def train_lbl(train_data, dev_data, test_data=[], K=20, context_sz=2,
              learning_rate=1.0, rate_update='simple', epochs=10,
              batch_size=100, rng=None, patience=None, patience_incr=2,
              improvement_thrs=0.995, validation_freq=1000,
              noise_data_ratio=25):
    """
    Train log-bilinear model with noise contrastive estimation
    """
    # create vocabulary from train data, plus <s>, </s>
    vocab = Dictionary.from_corpus(train_data, unk='<unk>')
    vocab.add_word('<s>')
    vocab.add_word('</s>')
    V = vocab.size()
    print vocab.vocab
    logger.debug("Vocabulary size: %d" % V)
    # initialize random generator if not provided
    rng = np.random.RandomState() if not rng else rng
    # generate (context, target) pairs of word ids
    train_set_x, train_set_y = make_instances(train_data, vocab, context_sz)
    dev_set_x, dev_set_y = make_instances(dev_data, vocab, context_sz)
    test_set_x, test_set_y = make_instances(test_data, vocab, context_sz)
    # generate noise samples from a unigram model of the training data
    noise_model = UnigramLanguageModel(train_data, vocab)
    data_sz = train_set_x.shape.eval()[0]
    noise_set = theano.shared(np.asarray(noise_model.samples(noise_data_ratio * data_sz),
                                         dtype=np.int32), borrow=True)
    # number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_dev_batches = dev_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
    # build the model
    logger.info("Build the model ...")
    index = T.lscalar()
    x = T.imatrix('x')
    y = T.ivector('y')
    noise = T.ivector('noise')
    # create log-bilinear model
    lbl = LogBilinearLanguageModelNCE(x, V, K, context_sz, rng)
    # cost function is the unnormalized log-probability
    cost = lbl.unnormalized_neg_log_likelihood(y)
    noise_cost = lbl.unnormalized_neg_log_likelihood(noise)
    cost_normalized = lbl.negative_log_likelihood(y)
    # compute gradient
    gparams = []
    noise_gparams = []
    for param in lbl.params:
        gparam = T.grad(cost, param)
        noise_gparam = T.grad(noise_cost, param)
        gparams.append(gparam)
        noise_gparams.append(noise_gparam)
    # specify NCE objective update step for model parameters
    updates = []
    for param, gparam, noise_gparam in zip(lbl.params, gparams, noise_gparams):
        # data weight: k * P_n(w) / (P_h(w) + k * P_n(w))
        nce_weight = noise_data_ratio * noise_model.likelihood(y) / \
            (lbl.unnormalized_neg_log_likelihood(y) +
             noise_data_ratio * noise_model.likelihood(y))
        # nce update
        # update = nce_weight * gparam
        update = gparam
        # debug: just add half of the update
        # updates.append((param, param - learning_rate * update))
        # gradient approximation with noise samples
        # noise weight: P_h(w) / (P_h(w) + k * P_n(w))
        # noise_weight = lbl.unnormalized_neg_log_likelihood(noise) / \
        #     (lbl.unnormalized_neg_log_likelihood(noise) +
        #      noise_data_ratio * noise_model.likelihood(noise))
        # noise_update = noise_weight * noise_gparam
        noise_update = noise_gparam
        # sum over k noise samples
        # NOTE: the result of the reshape/sum below is not assigned, so it is
        # currently a no-op
        noise_update.reshape((noise_data_ratio, y.shape[0])).sum(axis=0)
        # overall update step on objective function J
        updates.append((param, param - learning_rate * (update - noise_update)))
    # function that computes normalized log-probability of the dev set
    logprob_dev = theano.function(
        inputs=[index],
        outputs=cost_normalized,
        givens={
            x: dev_set_x[index * batch_size:(index + 1) * batch_size],
            y: dev_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # function that computes normalized log-probability of the test set
    logprob_test = theano.function(
        inputs=[index],
        outputs=cost_normalized,
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # function that returns the unnormalized cost and updates the parameters
    # debug: return update for first parameter (R matrix)
    # train_model = theano.function(inputs=[index], outputs=nce_weight,
    #                               updates=updates,
    #                               givens={x: train_set_x[index * batch_size:(index + 1) * batch_size],
    #                                       y: train_set_y[index * batch_size:(index + 1) * batch_size],
    #                                       noise: noise_set[index * batch_size * noise_data_ratio:
    #                                                        (index + 1) * batch_size * noise_data_ratio]},
    #                               on_unused_input='warn')
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            noise: noise_set[index * batch_size * noise_data_ratio:
                             (index + 1) * batch_size * noise_data_ratio]
        },
        on_unused_input='warn')
    # train_model = theano.function(inputs=[index], outputs=cost,
    #                               givens={x: train_set_x[index * batch_size:(index + 1) * batch_size],
    #                                       y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    # perplexity functions
    def compute_dev_logp():
        return np.mean([logprob_dev(i) for i in xrange(n_dev_batches)])

    def compute_test_logp():
        return np.mean([logprob_test(i) for i in xrange(n_test_batches)])

    def ppl(neg_logp):
        return np.power(2.0, neg_logp)

    # train model
    logger.info("training model...")
    best_params = None
    last_epoch_dev_ppl = np.inf
    best_dev_ppl = np.inf
    test_ppl = np.inf
    test_score = 0
    start_time = time.clock()
    done_looping = False
    for epoch in xrange(epochs):
        if done_looping:
            break
        logger.debug('epoch %i' % epoch)
        for minibatch_index in xrange(n_train_batches):
            itr = epoch * n_train_batches + minibatch_index
            # tmp = train_model(minibatch_index)
            # print "shape tmp:", tmp.shape
            train_logp = train_model(minibatch_index)
            logger.debug('epoch %i, minibatch %i/%i, train minibatch log prob %.4f ppl %.4f' %
                         (epoch, minibatch_index + 1, n_train_batches,
                          train_logp, ppl(train_logp)))
            if (itr + 1) % validation_freq == 0:
                # compute perplexity on dev set, lower is better
                dev_logp = compute_dev_logp()
                dev_ppl = ppl(dev_logp)
                logger.debug('epoch %i, minibatch %i/%i, dev log prob %.4f ppl %.4f' %
                             (epoch, minibatch_index + 1, n_train_batches,
                              dev_logp, ppl(dev_logp)))
                # if we got the lowest perplexity until now
                if dev_ppl < best_dev_ppl:
                    # increase patience if the improvement is good enough
                    if patience and dev_ppl < best_dev_ppl * improvement_thrs:
                        patience = max(patience, itr * patience_incr)
                    best_dev_ppl = dev_ppl
                    test_logp = compute_test_logp()
                    test_ppl = ppl(test_logp)
                    logger.debug('epoch %i, minibatch %i/%i, test log prob %.4f ppl %.4f' %
                                 (epoch, minibatch_index + 1, n_train_batches,
                                  test_logp, ppl(test_logp)))
            # stop learning if no improvement was seen for a long time
            if patience and patience <= itr:
                done_looping = True
                break
        # adapt learning rate
        if rate_update == 'simple':
            # set learning rate to 1 / (epoch+1)
            learning_rate = 1.0 / (epoch + 1)
        elif rate_update == 'adaptive':
            # halve learning rate if perplexity increased at end of epoch (Mnih and Teh 2012)
            this_epoch_dev_ppl = ppl(compute_dev_logp())
            if this_epoch_dev_ppl > last_epoch_dev_ppl:
                learning_rate /= 2.0
            last_epoch_dev_ppl = this_epoch_dev_ppl
        elif rate_update == 'constant':
            # keep learning rate constant
            pass
        else:
            raise ValueError("Unknown learning rate update strategy: %s" % rate_update)
    end_time = time.clock()
    total_time = end_time - start_time
    logger.info('Optimization complete with best dev ppl of %.4f and test ppl %.4f' %
                (best_dev_ppl, test_ppl))
    logger.info('Training took %d epochs, with %.1f epochs/sec' %
                (epoch + 1, float(epoch + 1) / total_time))
    logger.info("Total training time %d days %d hours %d min %d sec." %
                (total_time / 60 / 60 / 24, total_time / 60 / 60 % 24,
                 total_time / 60 % 60, total_time % 60))
    # return model
    return lbl
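# The commented-out weighting in the update loop above follows the NCE
# gradient of Mnih and Teh (2012), where each data gradient is scaled by
# k * P_n(w) / (P_h(w) + k * P_n(w)) and each noise gradient by
# P_h(w) / (P_h(w) + k * P_n(w)).  Below is a minimal numpy-compatible sketch
# of those two weights (illustration only, not part of the original module);
# it assumes p_model and p_noise are arrays of (unnormalized) model and noise
# *probabilities*, not negative log-likelihoods as in the code above.
def nce_example_weights(p_model, p_noise, k):
    """Per-example NCE weights for the data and noise gradient terms."""
    denom = p_model + k * p_noise
    data_weight = k * p_noise / denom    # scales gradients on data samples
    noise_weight = p_model / denom       # scales gradients on noise samples
    return data_weight, noise_weight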