def SimpleQ_matches_Triple(rel_word_ids_f, rel_word_lens_f):
    # Only the relation inputs are passed in explicitly; q_word_ids_f,
    # q_word_lens_f, the embedding matrix and the shared conv parameters are
    # picked up from the enclosing training script's scope.
    rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
        (batch_size, max_relation_len, emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)
    q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
        (batch_size, max_Q_len, emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)

    #q-rel
    q_rel_conv = Conv_with_input_para(
        rng,
        input=q_word_input,
        image_shape=(batch_size, 1, emb_size, max_Q_len),
        filter_shape=word_filter_shape,
        W=q_rel_conv_W,
        b=q_rel_conv_b)
    rel_conv = Conv_with_input_para(
        rng,
        input=rel_word_input,
        image_shape=(batch_size, 1, emb_size, max_relation_len),
        filter_shape=word_filter_shape,
        W=q_rel_conv_W,
        b=q_rel_conv_b)
    #q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
    rel_conv_pool = Max_Pooling(rng,
                                input_l=rel_conv.output,
                                left_l=rel_word_lens_f[0],
                                right_l=rel_word_lens_f[2])
    q_rel_pool = Average_Pooling_for_SimpleQA(
        rng,
        input_l=q_rel_conv.output,
        input_r=rel_conv_pool.output_maxpooling,
        left_l=q_word_lens_f[0],
        right_l=q_word_lens_f[2],
        length_l=q_word_lens_f[1] + filter_size[1] - 1,
        dim=max_Q_len + filter_size[1] - 1,
        topk=2)

    overall_simi = cosine(q_rel_pool.output_maxpooling,
                          rel_conv_pool.output_maxpooling)
    return overall_simi
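
# Illustration only: `cosine` above is assumed to be the similarity helper
# from this project's utility module, operating on Theano tensors.  For
# reference, a minimal numpy sketch of such a cosine similarity between two
# 1-D vectors (hypothetical helper, not the project's implementation):
import numpy

def _cosine_sketch(u, v, eps=1e-8):
    # dot product divided by the product of the norms; eps only guards
    # against zero-norm inputs in this illustration
    return numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v) + eps)
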
def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f,
                           rel_word_lens_f, men_char_ids_f, q_word_ids_f,
                           men_lens_f, q_word_lens_f):
    # rng = numpy.random.RandomState(23455)
    ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape(
        (batch_size, max_char_len, char_emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)
    men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape(
        (batch_size, max_char_len, char_emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)
    rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
        (batch_size, max_relation_len, emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
        (batch_size, max_Q_len, emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)

    #ent_mention
    ent_char_conv = Conv_with_input_para(
        rng,
        input=ent_char_input,
        image_shape=(batch_size, 1, char_emb_size, max_char_len),
        filter_shape=char_filter_shape,
        W=char_conv_W,
        b=char_conv_b)
    men_char_conv = Conv_with_input_para(
        rng,
        input=men_char_input,
        image_shape=(batch_size, 1, char_emb_size, max_char_len),
        filter_shape=char_filter_shape,
        W=char_conv_W,
        b=char_conv_b)
    #q-rel
    q_rel_conv = Conv_with_input_para(
        rng,
        input=q_word_input,
        image_shape=(batch_size, 1, emb_size, max_Q_len),
        filter_shape=word_filter_shape,
        W=q_rel_conv_W,
        b=q_rel_conv_b)
    rel_conv = Conv_with_input_para(
        rng,
        input=rel_word_input,
        image_shape=(batch_size, 1, emb_size, max_relation_len),
        filter_shape=word_filter_shape,
        W=q_rel_conv_W,
        b=q_rel_conv_b)
    #q_desH
    #q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
    #                                   image_shape=(batch_size, 1, emb_size, max_Q_len),
    #                                   filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
    #desH_conv = Conv_with_input_para(rng, input=desH_word_input,
    #                                 image_shape=(batch_size, 1, emb_size, max_des_len),
    #                                 filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)

    ent_conv_pool = Max_Pooling(rng,
                                input_l=ent_char_conv.output,
                                left_l=ent_lens_f[0],
                                right_l=ent_lens_f[2])
    men_conv_pool = Max_Pooling(rng,
                                input_l=men_char_conv.output,
                                left_l=men_lens_f[0],
                                right_l=men_lens_f[2])

    #q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
    rel_conv_pool = Max_Pooling(rng,
                                input_l=rel_conv.output,
                                left_l=rel_word_lens_f[0],
                                right_l=rel_word_lens_f[2])
    q_rel_pool = Average_Pooling_for_SimpleQA(
        rng,
        input_l=q_rel_conv.output,
        input_r=rel_conv_pool.output_maxpooling,
        left_l=q_word_lens_f[0],
        right_l=q_word_lens_f[2],
        length_l=q_word_lens_f[1] + filter_size[1] - 1,
        dim=max_Q_len + filter_size[1] - 1,
        topk=2)

    #q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
    #desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])

    overall_simi = cosine(ent_conv_pool.output_maxpooling,
                          men_conv_pool.output_maxpooling) * 0.33333 + \
        cosine(q_rel_pool.output_maxpooling,
               rel_conv_pool.output_maxpooling) * 0.55
    # 0.0*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling)
    # cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
    return overall_simi
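
# Illustration only: this version of SimpleQ_matches_Triple combines two
# cosine scores, an entity/mention character-CNN match and a
# question/relation word-CNN match (the description-based terms are
# commented out above).  A sketch of the hard-coded weighting, assuming the
# two cosines are already computed:

def _triple_score_sketch(ent_men_cos, q_rel_cos):
    # the constants mirror the ones used above; note that they are
    # empirical and do not sum to 1.0 in the original code
    return 0.33333 * ent_men_cos + 0.55 * q_rel_cos
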
def evaluate_lenet5(file_name,
                    vocab_file,
                    train_file,
                    dev_file,
                    word2vec_file,
                    learning_rate=0.001,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=128,
                    max_d_length=128,
                    margin=0.3):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    f = open(file_name, 'w')
    f.write("model options " + str(model_options) + '\n')
    rng = numpy.random.RandomState(23455)
    train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN(
        vocab_file, train_file, dev_file, max_s_length, maxSentLength,
        maxDocLength)  #vocab_size contain train, dev and test
    f.write('train_size : ' + str(train_size))

    [
        train_data_D, train_data_A1, train_Label, train_Length_D,
        train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_A1, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_A1
    ] = train_data
    [
        test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s,
        test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_A1
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, word2vec_file)
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_A1 = T.lvector()
    y = T.lscalar()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_A1 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_A1 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_A1 = T.lscalar()

    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    f.write('... building the model\n')

    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength, emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength, emb_size)).transpose(
            0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # this part decides nkern[0] and nkern[1] must be in the same dimension
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')

    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')

    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')

    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0

    logistic_w, logistic_b = create_logistic_para(rng, 1, 2)
    logistic_para = [logistic_w, logistic_b]
    params += logistic_para
    simi_1 = T.dot(logistic_w, simi_1) + logistic_b.dimshuffle(0, 'x')
    simi_1 = simi_1.dimshuffle(1, 0)

    simi_1 = T.nnet.softmax(simi_1)
    predict = T.argmax(simi_1, axis=1)
    tmp = T.log(simi_1)
    cost = T.maximum(0.0, margin + tmp[0][1 - y] - tmp[0][y])
    L2_reg = (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum() + (
        logistic_w**2).sum()
    cost = cost + L2_weight * L2_reg

    test_model = theano.function(
        [index], [cost, simi_1, predict],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_A1: test_data_A1[index],
            y: test_Label[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_A1: test_Length_A1[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_A1: test_leftPad_A1[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_A1: test_rightPad_A1[index],
        },
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, simi_1, predict],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_A1: train_data_A1[index],
            y: train_Label[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_A1: train_Length_A1[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_A1: train_leftPad_A1[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_A1: train_rightPad_A1[index],
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    f.write('... training\n')
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network
    # on the validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        simi_train = []
        predict_train = []
        for batch_start in train_batch_start:
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            cost_average, simi, predict = train_model(batch_start)
            simi_train.append(simi)
            predict_train.append(predict)
            if iter % 1000 == 0:
                f.write('@iter :' + str(iter) + '\n')
            if iter % n_train_batches == 0:
                corr_train = compute_corr_train(predict_train, _train_Label)
                res = 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + 'corr rate: ' + str(
                            corr_train * 100.0 / train_size) + '\n'
                f.write(res)

            if iter % validation_frequency == 0 or iter % 20000 == 0:
                posi_test_sent = []
                nega_test_sent = []
                posi_test_doc = []
                nega_test_doc = []
                posi_test_overall = []
                nega_test_overall = []

                simi_test = []
                predict_test = []
                for i in test_batch_start:
                    cost, simi, predict = test_model(i)
                    #print simi
                    #f.write('test_predict : ' + str(predict) + ' test_simi : ' + str(simi) + '\n' )
                    simi_test.append(simi)
                    predict_test.append(predict)
                corr_test = compute_corr(simi_test, predict_test, f)
                test_acc = corr_test * 1.0 / (test_size / 4.0)
                res = '\t\t\tepoch ' + str(epoch) + ', minibatch ' + str(
                    minibatch_index) + ' / ' + str(
                        n_train_batches) + ' test acc of best model ' + str(
                            test_acc * 100.0) + '\n'
                f.write(res)

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                res = '\t\t\tmax: ' + str(max_acc) + ' (at ' + str(
                    best_epoch) + ')\n'
                f.write(res)
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
        #print 'Batch_size: ', update_freq

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
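
# Illustration only: the cost used in the function above is a margin loss on
# log-softmax scores; with y in {0, 1}, the log-probability of the gold class
# must exceed the other class by `margin` (default 0.3 above).  A numpy
# sketch under those assumptions:
import numpy

def _margin_log_softmax_cost_sketch(scores, y, margin=0.3):
    # scores: length-2 array of unnormalised class scores; y: gold label 0/1
    exp_s = numpy.exp(scores - scores.max())   # numerically stabilised softmax
    log_p = numpy.log(exp_s / exp_s.sum())
    return max(0.0, margin + log_p[1 - y] - log_p[y])
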
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[90, 90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=0.2): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/' rng = numpy.random.RandomState(23455) train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DSSSS( rootPath + 'vocab_DSSSS.txt', rootPath + 'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt', rootPath + 'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt', max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), # numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4), # numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4), # numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)] # return results, line_control [ train_data_D, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, train_Length_D, train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4, train_rightPad_D, train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4 ] = train_data [ test_data_D, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, test_Length_D, test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4, 
test_leftPad_D, test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() index_A1 = T.lvector() index_A2 = T.lvector() index_A3 = T.lvector() index_A4 = T.lvector() # y = T.lvector() len_D = T.lscalar() len_D_s = T.lvector() # len_Q=T.lscalar() len_A1 = T.lscalar() len_A2 = T.lscalar() len_A3 = T.lscalar() len_A4 = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() # left_Q=T.lscalar() left_A1 = T.lscalar() left_A2 = T.lscalar() left_A3 = T.lscalar() left_A4 = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() # right_Q=T.lscalar() right_A1 = T.lscalar() right_A2 = T.lscalar() right_A3 = T.lscalar() right_A4 = T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A4_input = embeddings[index_A4.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para = [conv_W, conv_b] conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para = [conv2_W, conv2_b] high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1]) highW_para = [high_W, high_b] params = layer2_para + layer0_para + highW_para #+[embeddings] #load_model(params) layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para( rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A2 = Conv_with_input_para( rng, input=layer0_A2_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A3 = Conv_with_input_para( rng, input=layer0_A3_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A4 = Conv_with_input_para( rng, input=layer0_A4_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') # layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output') layer0_A2_output = debug_print(layer0_A2.output, 'layer0_A2.output') layer0_A3_output = debug_print(layer0_A3.output, 'layer0_A3.output') layer0_A4_output = debug_print(layer0_A4.output, 'layer0_A4.output') # layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA1 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0], 
left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A1 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA2 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A2 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA3 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A3 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA4 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A4 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc # layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA1 = Conv_with_input_para( rng, input=layer1_DA1.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA2 = Conv_with_input_para( rng, input=layer1_DA2.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA3 = Conv_with_input_para( rng, input=layer1_DA3.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA4 = Conv_with_input_para( rng, input=layer1_DA4.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights # layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA1.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A2 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA2.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) 
layer2_A3 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA3.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A4 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA4.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A1_output_sent_rep_Dlevel = debug_print( layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') layer2_A2_output_sent_rep_Dlevel = debug_print( layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel') layer2_A3_output_sent_rep_Dlevel = debug_print( layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel') layer2_A4_output_sent_rep_Dlevel = debug_print( layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel') # layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA1 = Average_Pooling_for_Top( rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA2 = Average_Pooling_for_Top( rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA3 = Average_Pooling_for_Top( rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA4 = Average_Pooling_for_Top( rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) #high-way # transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_DA2 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2') transform_gate_DA3 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3') transform_gate_DA4 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4') # transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') transform_gate_A2 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 
'transform_gate_A2') transform_gate_A3 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3') transform_gate_A4 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4') # overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A1 = ( 1.0 - transform_gate_DA1 ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep overall_D_A2 = ( 1.0 - transform_gate_DA2 ) * layer1_DA2.output_D_sent_level_rep + transform_gate_DA2 * layer3_DA2.output_D_doc_level_rep overall_D_A3 = ( 1.0 - transform_gate_DA3 ) * layer1_DA3.output_D_sent_level_rep + transform_gate_DA3 * layer3_DA3.output_D_doc_level_rep overall_D_A4 = ( 1.0 - transform_gate_DA4 ) * layer1_DA4.output_D_sent_level_rep + transform_gate_DA4 * layer3_DA4.output_D_doc_level_rep # overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A1 = ( 1.0 - transform_gate_A1 ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel overall_A2 = ( 1.0 - transform_gate_A2 ) * layer1_DA2.output_QA_sent_level_rep + transform_gate_A2 * layer2_A2.output_sent_rep_Dlevel overall_A3 = ( 1.0 - transform_gate_A3 ) * layer1_DA3.output_QA_sent_level_rep + transform_gate_A3 * layer2_A3.output_sent_rep_Dlevel overall_A4 = ( 1.0 - transform_gate_A4 ) * layer1_DA4.output_QA_sent_level_rep + transform_gate_A4 * layer2_A4.output_sent_rep_Dlevel simi_sent_level1 = debug_print( cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') simi_sent_level2 = debug_print( cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2') simi_sent_level3 = debug_print( cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3') simi_sent_level4 = debug_print( cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4') simi_doc_level1 = debug_print( cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') simi_doc_level2 = debug_print( cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2') simi_doc_level3 = debug_print( cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3') simi_doc_level4 = debug_print( cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4') simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') simi_overall_level2 = debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2') simi_overall_level3 = debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3') simi_overall_level4 = debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4') simi_1 = simi_overall_level1 #+simi_sent_level1+simi_doc_level1 simi_2 = simi_overall_level2 #+simi_sent_level2+simi_doc_level2 simi_3 = simi_overall_level3 #+simi_sent_level3+simi_doc_level3 simi_4 = simi_overall_level4 #+simi_sent_level4+simi_doc_level4 # simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0 # simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0 # simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0 # 
simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) # #only use overall_simi # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_overall_level1 # nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # cost=T.maximum(0.0, margin+simi_2-simi_1)+T.maximum(0.0, margin+simi_3-simi_1)+T.maximum(0.0, margin+simi_4-simi_1) cost12 = T.maximum( 0.0, margin + simi_sent_level2 - simi_sent_level1) + T.maximum( 0.0, margin + simi_doc_level2 - simi_doc_level1) + T.maximum( 0.0, margin + simi_overall_level2 - simi_overall_level1) cost13 = T.maximum( 0.0, margin + simi_sent_level3 - simi_sent_level1) + T.maximum( 0.0, margin + simi_doc_level3 - simi_doc_level1) + T.maximum( 0.0, margin + simi_overall_level3 - simi_overall_level1) cost14 = T.maximum( 0.0, margin + simi_sent_level4 - simi_sent_level1) + T.maximum( 0.0, margin + simi_doc_level4 - simi_doc_level1) + T.maximum( 0.0, margin + simi_overall_level4 - simi_overall_level1) cost = cost12 + cost13 + cost14 posi_simi = T.max([simi_sent_level1, simi_doc_level1, simi_overall_level1]) nega_simi = T.max([ simi_sent_level2, simi_doc_level2, simi_overall_level2, simi_sent_level3, simi_doc_level3, simi_overall_level3, simi_sent_level4, simi_doc_level4, simi_overall_level4 ]) L2_reg = debug_print( (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg' ) #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost = debug_print(cost + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: test_data_D[index], #a matrix # index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], # len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], # left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], # right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index], right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # for param_i, grad_i, acc_i in zip(params, grads, 
accumulator): # acc = acc_i + T.sqr(grad_i) # if param_i == embeddings: # updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size))))) #AdaGrad # else: # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, posi_simi, nega_simi], updates=updates, givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data corr_train = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 sys.stdout.write("Training :[%6f] %% complete!\r" % ((iter % train_size) * 100.0 / train_size)) sys.stdout.flush() minibatch_index = minibatch_index + 1 cost_average, posi_simi, nega_simi = train_model(batch_start) if posi_simi > nega_simi: corr_train += 1 if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + 'corr rate:' + str( corr_train * 100.0 / train_size) if iter % validation_frequency == 0: corr_test = 0 for i in test_batch_start: cost, posi_simi, nega_simi = test_model(i) if posi_simi > nega_simi: corr_test += 1 #write_file.close() #test_score = numpy.mean(test_losses) test_acc = corr_test * 1.0 / test_size #test_acc=1-test_score print( ('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better = False if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')' if find_better == True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
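
# Illustration only: the objective in the function above is a pairwise
# ranking hinge applied at three granularities (sentence-level, doc-level,
# and the highway-combined "overall" representation); each wrong answer
# A2..A4 must trail the gold answer A1 by `margin` (default 0.2) at every
# level.  A sketch of the cost12 + cost13 + cost14 structure:

def _ranking_cost_sketch(pos_levels, neg_levels_list, margin=0.2):
    # pos_levels: (sent, doc, overall) similarities for the gold answer;
    # neg_levels_list: a list of such triples, one per negative answer
    cost = 0.0
    for neg_levels in neg_levels_list:
        for pos_simi, neg_simi in zip(pos_levels, neg_levels):
            cost += max(0.0, margin + neg_simi - pos_simi)
    return cost
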
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=7, maxSentLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00005, update_freq=10, norm_threshold=5.0): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/' rng = numpy.random.RandomState(23455) datasets, vocab_size = load_msr_corpus(rootPath + 'vocab.txt', rootPath + 'tokenized_train.txt', rootPath + 'tokenized_test.txt', maxSentLength) mtPath = '/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test = load_mts(mtPath + 'concate_15mt_train.txt', mtPath + 'concate_15mt_test.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[ 0] indices_train_l = indices_train[::2, :] indices_train_r = indices_train[1::2, :] trainLengths_l = trainLengths[::2] trainLengths_r = trainLengths[1::2] normalized_train_length_l = normalized_train_length[::2] normalized_train_length_r = normalized_train_length[1::2] trainLeftPad_l = trainLeftPad[::2] trainLeftPad_r = trainLeftPad[1::2] trainRightPad_l = trainRightPad[::2] trainRightPad_r = trainRightPad[1::2] indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[ 1] indices_test_l = indices_test[::2, :] indices_test_r = indices_test[1::2, :] testLengths_l = testLengths[::2] testLengths_r = testLengths[1::2] normalized_test_length_l = normalized_test_length[::2] normalized_test_length_r = normalized_test_length[1::2] testLeftPad_l = testLeftPad[::2] testLeftPad_r = testLeftPad[1::2] testRightPad_l = testRightPad[::2] testRightPad_r = testRightPad[1::2] n_train_batches = indices_train_l.shape[0] / batch_size n_test_batches = indices_test_l.shape[0] / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l = T.cast(indices_train_l, 'int32') indices_train_r = T.cast(indices_train_r, 'int32') indices_test_l = T.cast(indices_test_l, 'int32') indices_test_r = T.cast(indices_test_r, 'int32') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) cost_tmp = 0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.imatrix( 'x_index_l') # now, x is the index matrix, must be integer x_index_r = T.imatrix('x_index_r') y = T.ivector('y') left_l = T.iscalar() right_l = T.iscalar() left_r = T.iscalar() right_r = T.iscalar() length_l = T.iscalar() length_r = T.iscalar() norm_length_l = T.dscalar() norm_length_r = T.dscalar() mts = T.dmatrix() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size = (emb_size, window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output') layer0_para = [conv_W, conv_b] layer1 = Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1, dim=maxSentLength + filter_size[1] - 1, window_size=window_width, maxSentLength=maxSentLength) conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1])) layer2_l = Conv_with_input_para(rng, input=layer1.output_tensor_l, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_r = Conv_with_input_para(rng, input=layer1.output_tensor_r, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_para = [conv2_W, conv2_b] layer3 = Average_Pooling_for_batch1(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1, dim=maxSentLength + filter_size[1] - 1) layer3_out = debug_print(layer3.output_simi, 'layer1_out') #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) ''' uni_cosine=cosine(sum_uni_l, sum_uni_r) linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r)) #25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l = norm_length_l.reshape((1, 1)) len_r = norm_length_r.reshape((1, 1)) #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer4_input = T.concatenate( [mts, eucli_1, 
layer1.output_eucli, layer3_out, len_l, len_r], axis=1) #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4 = LogisticRegression(rng, input=layer4_input, n_in=15 + 3 + 2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer4.W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg') #+(layer1.W** 2).sum() cost_this = debug_print(layer4.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') test_model = theano.function( [index], [layer4.errors(y), layer4.y_pred], givens={ x_index_l: indices_test_l[index:index + batch_size], x_index_r: indices_test_r[index:index + batch_size], y: testY[index:index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index:index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params + layer2_para + layer0_para # + layer1.params accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') #norm=T.sqrt((grad_i**2).sum()) #if T.lt(norm_threshold, norm): # print 'big norm' # grad_i=grad_i*(norm_threshold/norm) acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, layer4.errors(y), layer4_input], updates=updates, givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer4.errors(y)], givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) if iter % update_freq != 0: cost_ij, error_ij = train_model_predict(batch_start) #print 'cost_ij: ', cost_ij cost_tmp += cost_ij error_sum += error_ij else: cost_average, error_ij, layer3_input = train_model(batch_start) #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq) error_sum = 0 cost_tmp = 0 #reset for the next batch #print layer3_input #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + ' error: ' + str( error_sum) + '/' + str( update_freq) + ' error rate: ' + str( error_sum * 1.0 / update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses = [] for i in test_batch_start: test_loss, pred_y = test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) print(( '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' #print 'validating & testing...' 
# compute zero-one loss on validation set validation_losses = [] for i in dev_batch_start: time.sleep(0.5) validation_losses.append(validate_model(i)) #validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
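# Illustrative sketch, not part of the original script: the training loops in
# these scripts all apply the same AdaGrad rule -- accumulate squared gradients
# per parameter and divide the learning rate by the square root of the
# accumulator.  The names below (params, grads, accumulators) are hypothetical
# numpy stand-ins for the Theano shared variables updated above.
import numpy


def adagrad_step(params, grads, accumulators, learning_rate=0.09, eps=1e-8):
    """Apply one AdaGrad update in place; all arguments are lists of arrays."""
    for i, (p, g) in enumerate(zip(params, grads)):
        accumulators[i] += g ** 2                                  # running sum of squared grads
        p -= learning_rate * g / (numpy.sqrt(accumulators[i]) + eps)
    return params, accumulators

# Example use: params, accs = adagrad_step([w], [dw], [numpy.zeros_like(w)])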
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/' rng = numpy.random.RandomState(23455) train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus( rootPath + 'vocab.txt', rootPath + 'mc500.train.tsv_standardlized.txt', rootPath + 'mc500.test.tsv_standardlized.txt', max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') [ train_data_D, train_data_Q, train_data_A, train_Y, train_Label, train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A, train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, train_rightPad_D, train_rightPad_D_s, train_rightPad_Q, train_rightPad_A ] = train_data [ test_data_D, test_data_Q, test_data_A, test_Y, test_Label, test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 
'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A = T.lvector() y = T.lvector() len_D = T.lscalar() len_D_s = T.lvector() len_Q = T.lscalar() len_A = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() left_Q = T.lscalar() left_A = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() right_Q = T.lscalar() right_A = T.lscalar() #wmf=T.dmatrix() cost_tmp = T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_Q_input = embeddings[index_Q.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A_input = embeddings[index_A.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) # load_model_for_conv1([conv_W, conv_b]) layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_Q = Conv_with_input_para( rng, input=layer0_Q_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A = Conv_with_input_para( rng, input=layer0_A_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') layer0_Q_output = debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A_output = debug_print(layer0_A.output, 'layer0_A.output') layer0_para = [conv_W, conv_b] layer1_DQ = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_Q + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) layer1_DA = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A_output, kern=nkerns[0], 
left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A, right_r=right_A, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=3) conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc layer2_DQ = Conv_with_input_para( rng, input=layer1_DQ.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA = Conv_with_input_para( rng, input=layer1_DA.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights layer2_Q = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DQ.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_Q_output_sent_rep_Dlevel = debug_print( layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A_output_sent_rep_Dlevel = debug_print( layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel') layer2_para = [conv2_W, conv2_b] layer3_DQ = Average_Pooling_for_Top( rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) layer3_DA = Average_Pooling_for_Top( rng, input_l=layer2_DA.output, input_r=layer2_A_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=3) #high-way high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1]) transform_gate_DQ = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b), 'transform_gate_DA') transform_gate_Q = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b), 'transform_gate_A') highW_para = [high_W, high_b] overall_D_Q = debug_print( (1.0 - transform_gate_DQ) * layer1_DQ.output_D_sent_level_rep + transform_gate_DQ * layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A = ( 1.0 - transform_gate_DA ) * layer1_DA.output_D_sent_level_rep + transform_gate_DA * layer3_DA.output_D_doc_level_rep overall_Q = ( 1.0 - transform_gate_Q ) * layer1_DQ.output_QA_sent_level_rep + transform_gate_Q * layer2_Q.output_sent_rep_Dlevel overall_A = ( 1.0 - transform_gate_A ) * layer1_DA.output_QA_sent_level_rep + transform_gate_A * layer2_A.output_sent_rep_Dlevel simi_sent_level = 
debug_print( cosine( layer1_DQ.output_D_sent_level_rep + layer1_DA.output_D_sent_level_rep, layer1_DQ.output_QA_sent_level_rep + layer1_DA.output_QA_sent_level_rep), 'simi_sent_level') simi_doc_level = debug_print( cosine( layer3_DQ.output_D_doc_level_rep + layer3_DA.output_D_doc_level_rep, layer2_Q.output_sent_rep_Dlevel + layer2_A.output_sent_rep_Dlevel), 'simi_doc_level') simi_overall_level = debug_print( cosine(overall_D_Q + overall_D_A, overall_Q + overall_A), 'simi_overall_level') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) layer4_input = debug_print( T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level], axis=1), 'layer4_input') #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4 = LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer4.W**2).sum() + (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg') #+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this = debug_print(layer4.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') # # [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, # train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A, # train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, # train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data # [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, # test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A, # test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, # test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data # index = T.lscalar() # index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() # index_A= T.lvector() # # y = T.lvector() # len_D=T.lscalar() # len_D_s=T.lvector() # len_Q=T.lscalar() # len_A=T.lscalar() # # left_D=T.lscalar() # left_D_s=T.lvector() # left_Q=T.lscalar() # left_A=T.lscalar() # # right_D=T.lscalar() # right_D_s=T.lvector() # right_Q=T.lscalar() # right_A=T.lscalar() # # # #wmf=T.dmatrix() # cost_tmp=T.dscalar() test_model = theano.function( [index], [layer4.errors(y), layer4_input, y, layer4.prop_for_posi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A: test_data_A[index], y: test_Y[index:index + batch_size], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A: test_Length_A[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A: test_leftPad_A[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A: test_rightPad_A[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params + layer2_para + layer0_para + highW_para accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all 
model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index, cost_tmp], cost, updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index + batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer4.errors(y), layer4_input, y], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index + batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data cost_tmp = 0.0 # readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r') # train_pairs=[] # train_y=[] # for line in readfile: # tokens=line.strip().split('\t') # listt=tokens[0]+'\t'+tokens[1] # train_pairs.append(listt) # train_y.append(tokens[2]) # readfile.close() # writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w') for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 sys.stdout.write("Training :[%6f] %% complete!\r" % (batch_start * 100.0 / train_size)) sys.stdout.flush() minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter % update_freq != 0: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) #print 'layer3_input', layer3_input cost_tmp += cost_ij error_sum += error_ij else: cost_average = train_model(batch_start, cost_tmp) #print 'layer3_input', layer3_input error_sum = 0 cost_tmp = 0.0 #reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str(cost_average) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses = [] test_y = [] test_features = [] test_prop = [] for i in test_batch_start: test_loss, layer3_input, y, posi_prop = test_model(i) test_prop.append(posi_prop[0][0]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() #test_score = numpy.mean(test_losses) test_acc = compute_test_acc(test_y, test_prop) #test_acc=1-test_score print( ('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y = [] train_features = [] count = 0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC( kernel='linear' ) #OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results = clf.decision_function(test_features) lr = linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr = lr.decision_function(test_features) acc_svm = compute_test_acc(test_y, results) acc_lr = compute_test_acc(test_y, results_lr) find_better = False if acc_svm > max_acc: max_acc = acc_svm best_epoch = epoch find_better = True if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True if acc_lr > max_acc: max_acc = acc_lr best_epoch = epoch find_better = True print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc, '(at', best_epoch, ')' # if find_better==True: # 
store_model_to_file(layer2_para, best_epoch) # print 'Finished storing best conv params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
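# Illustrative sketch, not part of the original script: the MCTest model above
# mixes sentence-level and document-level representations through a
# highway-style gate, g = sigmoid(W . h_sent + b), overall = (1 - g) * h_sent
# + g * h_doc; nkerns[0] and nkerns[1] must match for the element-wise mix.
# A minimal numpy version, assuming 1-D representation vectors; high_W and
# high_b are stand-ins for the parameters created by create_highw_para.
import numpy


def highway_combine(sent_rep, doc_rep, high_W, high_b):
    """Gate between a sentence-level and a document-level representation."""
    gate = 1.0 / (1.0 + numpy.exp(-(numpy.dot(high_W, sent_rep) + high_b)))   # sigmoid gate
    return (1.0 - gate) * sent_rep + gate * doc_rep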
def evaluate_lenet5(learning_rate=0.06, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=[4, 4], maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=40): maxSentLength = max_truncate + 2 * (window_width[0] - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/' rng = numpy.random.RandomState(23455) datasets, vocab_size = load_wikiQA_corpus( rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt', rootPath + 'test_filtered.txt', max_truncate, maxSentLength) #vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test = load_mts_wikiQA( mtPath + 'result_train/concate_2mt_train.txt', mtPath + 'result_test/concate_2mt_test.txt') wm_train, wm_test = load_wmf_wikiQA( rootPath + 'train_word_matching_scores.txt', rootPath + 'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[ 0] indices_train_l = indices_train[::2, :] indices_train_r = indices_train[1::2, :] trainLengths_l = trainLengths[::2] trainLengths_r = trainLengths[1::2] normalized_train_length_l = normalized_train_length[::2] normalized_train_length_r = normalized_train_length[1::2] trainLeftPad_l = trainLeftPad[::2] trainLeftPad_r = trainLeftPad[1::2] trainRightPad_l = trainRightPad[::2] trainRightPad_r = trainRightPad[1::2] indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[ 1] indices_test_l = indices_test[::2, :] indices_test_r = indices_test[1::2, :] testLengths_l = testLengths[::2] testLengths_r = testLengths[1::2] normalized_test_length_l = normalized_test_length[::2] normalized_test_length_r = normalized_test_length[1::2] testLeftPad_l = testLeftPad[::2] testLeftPad_r = testLeftPad[1::2] testRightPad_l = testRightPad[::2] testRightPad_r = testRightPad[1::2] n_train_batches = indices_train_l.shape[0] / batch_size n_test_batches = indices_test_l.shape[0] / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = list(numpy.arange(n_test_batches) * batch_size) indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l = T.cast(indices_train_l, 'int64') indices_train_r = T.cast(indices_train_r, 'int64') indices_test_l = T.cast(indices_test_l, 'int64') indices_test_r = T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_embs_300d.txt') 
#rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix( 'x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l = T.lscalar() right_l = T.lscalar() left_r = T.lscalar() right_r = T.lscalar() length_l = T.lscalar() length_r = T.lscalar() norm_length_l = T.dscalar() norm_length_r = T.dscalar() mts = T.dmatrix() wmf = T.dmatrix() cost_tmp = T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size = (emb_size, window_width[0]) filter_size_2 = (nkerns[0], window_width[1]) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv = ishape[1] + filter_size[1] - 1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) load_model_from_file([conv_W, conv_b]) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output') layer0_para = [conv_W, conv_b] layer1 = Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1, dim=maxSentLength + filter_size[1] - 1, window_size=window_width[0], maxSentLength=maxSentLength) conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1])) #load_model_from_file([conv2_W, conv2_b]) layer2_l = Conv_with_input_para( rng, input=layer1.output_tensor_l, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]), W=conv2_W, b=conv2_b) layer2_r = Conv_with_input_para( rng, input=layer1.output_tensor_r, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]), W=conv2_W, b=conv2_b) layer2_para = [conv2_W, conv2_b] layer3 = Average_Pooling_for_Top(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l + filter_size_2[1] - 1, length_r=length_r + filter_size_2[1] - 1, dim=maxSentLength + 
filter_size_2[1] - 1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l = sum_uni_l / layer0_l_input.shape[3] norm_uni_l = sum_uni_l / T.sqrt((sum_uni_l**2).sum()) sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r = sum_uni_r / layer0_r_input.shape[3] norm_uni_r = sum_uni_r / T.sqrt((sum_uni_r**2).sum()) uni_cosine = cosine(sum_uni_l, sum_uni_r) aver_uni_cosine = cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi = debug_print( T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1, 1)), 'uni_sigmoid_simi') ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r)) #25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l = norm_length_l.reshape((1, 1)) len_r = norm_length_r.reshape((1, 1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input = T.concatenate( [ #mts, uni_cosine, #eucli_1_exp,#uni_sigmoid_simi, #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # layer1. output_cosine, #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer3.output_cosine, len_l, len_r, wmf ], axis=1) #, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3 = LogisticRegression(rng, input=layer3_input, n_in=(1) + (1) + (1) + 2 + 2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg = debug_print( (layer3.W**2).sum() + (conv2_W**2).sum(), 'L2_reg' ) #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this') #+L2_weight*L2_reg cost = debug_print( (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [layer3.prop_for_posi, layer3_input, y], givens={ x_index_l: indices_test_l[index:index + batch_size], x_index_r: indices_test_r[index:index + batch_size], y: testY[index:index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index:index + batch_size], wmf: wm_test[index:index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params + layer2_para #+layer0_para accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, 
param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [index, cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], wmf: wm_train[index:index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost_this, layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index:index + batch_size], x_index_r: indices_train_r[index:index + batch_size], y: trainY[index:index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index:index + batch_size], wmf: wm_train[index:index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches / 5, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False svm_max = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 #shuffle(train_batch_start)#shuffle training data cost_tmp = 0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 minibatch_index = minibatch_index + 1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter % update_freq != 0: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) #print 'layer3_input', layer3_input cost_tmp += cost_ij error_sum += error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average = train_model(batch_start, cost_tmp) #print 'layer3_input', layer3_input error_sum = 0 cost_tmp = 0.0 #reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + ' error: ' + str( error_sum) + '/' + str( update_freq) + ' error rate: ' + str( error_sum * 1.0 / update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_probs = [] test_y = [] test_features = [] for i in test_batch_start: prob_i, layer3_input, y = test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_probs.append(prob_i[0][0]) test_y.append(y[0]) test_features.append(layer3_input[0]) MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt', test_probs) #now, check MAP and MRR print( ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best ' 'model %f, MRR %f') % (epoch, minibatch_index, n_train_batches, MAP, MRR)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y = [] train_features = [] count = 0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y = train_model_predict( batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results_svm = clf.decision_function(test_features) MAP_svm, MRR_svm = compute_map_mrr( rootPath + 'test_filtered.txt', results_svm) lr = LinearRegression().fit(train_features, train_y) results_lr = lr.predict(test_features) MAP_lr, MRR_lr = compute_map_mrr( rootPath + 'test_filtered.txt', results_lr) print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr if patience <= iter: done_looping = True break #after each epoch, increase the batch_size if epoch % 2 == 1: update_freq = update_freq * 1 else: update_freq = update_freq / 1 #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
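# Illustrative sketch, not part of the original script: the WikiQA run above
# reports MAP/MRR through the project's compute_map_mrr helper, whose
# internals are not shown here.  The generic version below computes the two
# metrics from (question_id, gold_label, score) triples; it only illustrates
# the definitions and does not claim to reproduce that helper.
from collections import defaultdict


def map_mrr(triples):
    """triples: iterable of (question_id, is_relevant, score)."""
    per_q = defaultdict(list)
    for qid, label, score in triples:
        per_q[qid].append((score, label))
    aps, rrs = [], []
    for cands in per_q.values():
        ranked = sorted(cands, key=lambda x: -x[0])            # highest score first
        hits, precisions, first_rr = 0, [], 0.0
        for rank, (_, label) in enumerate(ranked, start=1):
            if label:
                hits += 1
                precisions.append(hits / float(rank))
                if first_rr == 0.0:
                    first_rr = 1.0 / rank
        if precisions:                                         # skip all-negative questions
            aps.append(sum(precisions) / len(precisions))
            rrs.append(first_rr)
    return sum(aps) / len(aps), sum(rrs) / len(rrs)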
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[44], batch_size=1, window_width=3, maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=24): maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_nonoverlap.txt', rootPath+'train_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate,maxSentLength, entailment=True)#vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') extra_train, extra_test=load_extra_features(rootPath+'train_rule_features_cosine_eucli_negation_len1_len2.txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2.txt') discri_train, discri_test=load_extra_features(rootPath+'train_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) 
rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_nonoverlap_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() extra=T.dmatrix() discri=T.dmatrix() #wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l=sum_uni_l/layer0_l_input.shape[3] norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r=sum_uni_r/layer0_r_input.shape[3] norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') linear=Linear(norm_uni_l, norm_uni_r) poly=Poly(norm_uni_l, norm_uni_r) sigmoid=Sigmoid(norm_uni_l, norm_uni_r) rbf=RBF(norm_uni_l, norm_uni_r) gesd=GESD(norm_uni_l, norm_uni_r) eucli_1=1.0/(1.0+EUCLID(sum_uni_l, 
sum_uni_r))#25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([mts, eucli_1,uni_cosine,#linear, poly,sigmoid,rbf, gesd, #sum_uni_r-sum_uni_l, layer1.output_eucli_to_simi,layer1.output_cosine, #layer1.output_vector_r-layer1.output_vector_l, len_l, len_r, extra #discri #wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=14+(2)+(2)+2+5, n_out=3) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.errors(y),layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size], extra: extra_test[index: index + batch_size], discri:discri_test[index: index + batch_size] #wmf: wm_test[index: index + batch_size] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params params_conv = [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ 
x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], extra: extra_train[index: index + batch_size], discri:discri_train[index: index + batch_size] #wmf: wm_train[index: index + batch_size] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] for i in test_batch_start: test_loss, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 
#write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) #lr=LinearRegression().fit(train_features, train_y) #results_lr=lr.predict(test_features) corr_count=0 #corr_lr=0 corr_neu=0 neu_co=0 corr_ent=0 ent_co=0 corr_contr=0 contr_co=0 test_size=len(test_y) for i in range(test_size): if test_y[i]==0:#NEUTRAL neu_co+=1 if results[i]==test_y[i]: corr_neu+=1 elif test_y[i]==1:#ENTAILMENT ent_co+=1 if results[i]==test_y[i]: corr_ent+=1 elif test_y[i]==2:#CONTRADICTION contr_co+=1 if results[i]==test_y[i]: corr_contr+=1 ''' if results[i]==test_y[i]: corr_count+=1 if test_y[i]==0: #NEUTRAL corr_neu+=1 ''' #if numpy.absolute(results_lr[i]-test_y[i])<0.5: # corr_lr+=1 corr_count=corr_neu+corr_ent+corr_contr acc=corr_count*1.0/test_size acc_neu=corr_neu*1.0/neu_co acc_ent=corr_ent*1.0/ent_co acc_contr=corr_contr*1.0/contr_co #acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_epoch=epoch if test_acc > max_acc: max_acc=test_acc best_epoch=epoch #if acc_lr> max_acc: # max_acc=acc_lr # best_epoch=epoch print '\t\t\tsvm acc: ', acc, ' max acc: ', max_acc,'(at',best_epoch,')',' Neu: ',acc_neu, ' Ent: ',acc_ent, ' Contr: ',acc_contr if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
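# Illustrative sketch, not part of the original script: the entailment run
# above collects the concatenated similarity features (layer3_input) for every
# example, re-trains a linear SVM on them, and reports per-class accuracy for
# NEUTRAL / ENTAILMENT / CONTRADICTION.  A compact scikit-learn version of
# that pattern; train_features, train_y, test_features, test_y are assumed to
# be the Python lists collected in the loops above.
import numpy
from sklearn import svm


def svm_per_class_accuracy(train_features, train_y, test_features, test_y,
                           n_classes=3):
    clf = svm.SVC(kernel='linear')
    clf.fit(train_features, train_y)
    pred = clf.predict(test_features)
    test_y = numpy.asarray(test_y)
    overall = numpy.mean(pred == test_y)
    per_class = [numpy.mean(pred[test_y == c] == c) if (test_y == c).any() else float('nan')
                 for c in range(n_classes)]
    return overall, per_class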
layer0_D_input = embeddings[index_D.flatten()].reshape(
    (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
layer0_A1_input = embeddings[index_A1.flatten()].reshape(
    (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
layer0_A2_input = embeddings[index_A2.flatten()].reshape(
    (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
layer0_A3_input = embeddings[index_A3.flatten()].reshape(
    (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

conv_W, conv_b = create_conv_para(
    rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
layer0_para = [conv_W, conv_b]
conv2_W, conv2_b = create_conv_para(
    rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
layer2_para = [conv2_W, conv2_b]
high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])  # this part decides nkern[0] and nkern[1] must be in the same dimension
highW_para = [high_W, high_b]
params = layer2_para + layer0_para + highW_para  #+[embeddings]

layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
                                image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
                                filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
                                W=conv_W, b=conv_b)
layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input,
                                 image_shape=(batch_size, 1, ishape[0], ishape[1]),
                                 filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
                                 W=conv_W, b=conv_b)
layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
                                 image_shape=(batch_size, 1, ishape[0], ishape[1]),
                                 filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
                                 W=conv_W, b=conv_b)
layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
                                 image_shape=(batch_size, 1, ishape[0], ishape[1]),
                                 filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
                                 W=conv_W, b=conv_b)
layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')
layer0_A2_output = debug_print(layer0_A2.output, 'layer0_A2.output')
layer0_A3_output = debug_print(layer0_A3.output, 'layer0_A3.output')
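# Illustrative sketch, not part of the original script: every input above is
# prepared with the same idiom -- look the word indices up in the embedding
# matrix, reshape to (batch, sent_len, emb_size), swap the last two axes, and
# add a singleton "channel" axis so the tensor matches the
# (batch, 1, emb_size, sent_len) image_shape expected by Conv_with_input_para
# (dimshuffle(0, 'x', 1, 2) in Theano).  A numpy equivalent for the shapes:
import numpy


def to_conv_input(embeddings, index_matrix):
    """embeddings: (vocab, emb_size) array; index_matrix: (batch, sent_len) ints."""
    batch, sent_len = index_matrix.shape
    emb = embeddings[index_matrix.flatten()]                        # (batch*sent_len, emb_size)
    emb = emb.reshape((batch, sent_len, -1)).transpose(0, 2, 1)     # (batch, emb_size, sent_len)
    return emb[:, None, :, :]                                       # (batch, 1, emb_size, sent_len)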
def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f, desH_word_ids_f, desH_word_lens_f, desT_word_ids_f, desT_word_lens_f): # rng = numpy.random.RandomState(23455) ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) men_char_input = char_embeddings[men_char_ids.flatten()].reshape( (batch_size, max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape( (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape( (batch_size, max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape( (batch_size, max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) q_word_input = embeddings[q_word_ids.flatten()].reshape( (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #ent_mention ent_char_conv = Conv_with_input_para(rng, input=ent_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) men_char_conv = Conv_with_input_para(rng, input=men_char_input, image_shape=(batch_size, 1, char_emb_size, max_char_len), filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b) #q-rel q_rel_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) rel_conv = Conv_with_input_para(rng, input=rel_word_input, image_shape=(batch_size, 1, emb_size, max_relation_len), filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b) #q_desH q_desH_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) desH_conv = Conv_with_input_para(rng, input=desH_word_input, image_shape=(batch_size, 1, emb_size, max_des_len), filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b) #q_desT q_desT_conv = Conv_with_input_para(rng, input=q_word_input, image_shape=(batch_size, 1, emb_size, max_Q_len), filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b) desT_conv = Conv_with_input_para(rng, input=desT_word_input, image_shape=(batch_size, 1, emb_size, max_des_len), filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b) # ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output') # men_char_output=debug_print(men_char_conv.output, 'men_char.output') ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2]) men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens[0], right_l=men_lens[2]) q_rel_pool = Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2]) rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2]) q_desH_pool = Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2]) desH_conv_pool = Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2]) q_desT_pool = Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2]) desT_conv_pool = Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], 
                                  right_l=desT_word_lens_f[2])

    overall_simi = cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling) +\
                   cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling) +\
                   cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling) +\
                   cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
    return overall_simi
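# A small stand-alone sketch of how overall_simi is assembled: four cosine
# similarities between pooled (1, d) row vectors, simply summed. The vectors below
# are random stand-ins for the *_pool.output_maxpooling variables above.
import numpy as np
def cosine_rowvec(a, b, eps=1e-8):
    a, b = a.ravel(), b.ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + eps))
d = 8
ent, men, q_rel, rel, q_desH, desH, q_desT, desT = (np.random.randn(1, d) for _ in range(8))
overall_simi = (cosine_rowvec(ent, men) + cosine_rowvec(q_rel, rel)
                + cosine_rowvec(q_desH, desH) + cosine_rowvec(q_desT, desT))
print(overall_simi)   # score in [-4, 4]; larger means a better (question, triple) match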
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3, maxSentLength=30, emb_size=300, hidden_size=[300,10], margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) vocab_size=len(word2id)+1 mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2] indices_train_r=indices_train[1::2] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2] indices_test_r=indices_test[1::2] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] train_size = len(indices_train_l) test_size = len(indices_test_l) train_batch_start=range(train_size) test_batch_start=range(test_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int32') # indices_train_r=T.cast(indices_train_r, 'int32') # indices_test_l=T.cast(indices_test_l, 'int32') # indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng) # rand_values[0]=numpy.array(numpy.zeros(emb_size)) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec) embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.iscalar() x_index_l = T.imatrix() # now, x is the index matrix, must be integer x_index_r = T.imatrix() y = T.ivector() left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.fscalar() norm_length_r=T.fscalar() mts=T.fmatrix() wmf=T.fmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0])) layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0])) layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size)) norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts HL_layer_1_input=T.concatenate([ # mts, eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # uni_cosine, # sum_uni_l, # sum_uni_r, # sum_uni_l+sum_uni_r, 1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)), cosine(layer0_l_output_maxpool, layer0_r_output_maxpool), layer0_l_output_maxpool, layer0_r_output_maxpool, T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10), layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer1.output_cosine, layer1.output_vector_l, layer1.output_vector_r, T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10), # len_l, len_r layer1.output_attentions # wmf, ], axis=1)#, layer2.output, 
layer1.output_cosine], axis=1) HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input, mts, len_l, len_r # wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) HL_layer_1_input_size=1+1+ 1+1+3* nkerns[0] +1+1+3*nkerns[0]+10*10 HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh) LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1) LR_layer_input_with_extra=T.concatenate([HL_layer_2.output, HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output, LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2) # LR_layer_input=HL_layer_2.output # LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2) # layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() # diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True) params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): clipped_grad = T.clip(grad_i, -0.5, 0.5) acc = acc_i + T.sqr(clipped_grad) updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True) train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 nn_max_acc=0.0 best_iter=0 cost_tmp=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data for index in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * train_size + minibatch_index +1 minibatch_index=minibatch_index+1 # if iter%update_freq != 0: # cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) # #print 'cost_ij: ', cost_ij # cost_tmp+=cost_ij # error_sum+=error_ij # else: cost_i, error_i= train_model(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) cost_tmp+=cost_i if iter < 6000 and iter %100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) if iter >= 6000 and iter % 100 == 0: # if iter%100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) test_losses=[] test_y=[] test_features=[] for index in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size], indices_test_r[index: index + batch_size], testY[index: index + batch_size], testLeftPad_l[index], testRightPad_l[index], testLeftPad_r[index], testRightPad_r[index], testLengths_l[index], testLengths_r[index], normalized_test_length_l[index], normalized_test_length_r[index], mt_test[index: index + batch_size], wm_test[index: index + batch_size]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc = (1-test_score) * 100. 
if test_acc > nn_max_acc: nn_max_acc = test_acc print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc #now, see the results of svm if use_svm: train_y=[] train_features=[] for index in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_iter=iter if acc_lr> max_acc: max_acc=acc_lr best_iter=iter print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at iter: ', best_iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
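# A NumPy sketch of the update rule used in this function: AdaGrad with the
# per-element gradient clipping from the training graph above (clip to [-0.5, 0.5],
# accumulate squared clipped gradients, scale by 1/sqrt(acc + 1e-10)). Values are
# toy ones; the real updates are built symbolically with theano.shared accumulators.
import numpy as np
def adagrad_clip_step(param, grad, acc, lr=0.008, clip=0.5, eps=1e-10):
    g = np.clip(grad, -clip, clip)                 # clipped gradient
    acc = acc + g ** 2                             # running sum of squared gradients
    return param - lr * g / np.sqrt(acc + eps), acc
w, acc = np.zeros(3), np.zeros(3)
for g in ([0.9, -2.0, 0.1], [0.3, 0.3, 0.3]):      # toy gradients
    w, acc = adagrad_clip_step(w, np.asarray(g), acc)
print(w)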
def evaluate_lenet5(file_name, input_filename, model_filename, learning_rate=0.001, n_epochs=2000, nkerns=[90, 90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=128, max_d_length=128, margin=0.3): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() f = open(file_name, 'w') f.write("model options " + str(model_options) + '\n') #rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN( 'vocab_table_wenyan.txt', input_filename, input_filename, max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test f.write('train_size : ' + str(train_size)) #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), # numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4), # numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4), # numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)] # return results, line_control [ train_data_D, train_data_A1, train_Label, train_Length_D, train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_rightPad_D, train_rightPad_D_s, train_rightPad_A1 ] = train_data [ test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s, test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) 
test_batch_start = list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init(rand_values, 'vectors_wenyan2.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() index_A1 = T.lvector() # index_A2= T.lvector() # index_A3= T.lvector() # index_A4= T.lvector() y = T.lscalar() len_D = T.lscalar() len_D_s = T.lvector() # len_Q=T.lscalar() len_A1 = T.lscalar() # len_A2=T.lscalar() # len_A3=T.lscalar() # len_A4=T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() # left_Q=T.lscalar() left_A1 = T.lscalar() # left_A2=T.lscalar() # left_A3=T.lscalar() # left_A4=T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() # right_Q=T.lscalar() right_A1 = T.lscalar() # right_A2=T.lscalar() # right_A3=T.lscalar() # right_A4=T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### f.write('... 
building the model\n') # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape( (batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) #layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b = create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para = [conv_W, conv_b] conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para = [conv2_W, conv2_b] high_W, high_b = create_highw_para( rng, nkerns[0], nkerns[1] ) # this part decides nkern[0] and nkern[1] must be in the same dimension highW_para = [high_W, high_b] params = layer2_para + layer0_para + highW_para #+[embeddings] #load_model(params) layer0_D = Conv_with_input_para( rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para( rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) #layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') # layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output') #layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output') # layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output') # layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output') # layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA1 = Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, 
right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s + filter_words[1] - 1, length_r=len_A1 + filter_words[1] - 1, dim=maxSentLength + filter_words[1] - 1, doc_len=maxDocLength, topk=1) #layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) # layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) # layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc # layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA1 = Conv_with_input_para( rng, input=layer1_DA1.output_D.reshape( (batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights # layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap( rng, input=layer1_DA1.output_QA_sent_level_rep.reshape( (batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_A3 = 
Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A1_output_sent_rep_Dlevel = debug_print( layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') # layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel') # layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel') # layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel') # layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA1 = Average_Pooling_for_Top( rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D + filter_sents[1] - 1, length_r=1, dim=maxDocLength + filter_sents[1] - 1, topk=1) #layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) # layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) # layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) #high-way # transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_A1 = debug_print( T.nnet.sigmoid( T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') # transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2') # transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3') # transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4') # overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A1 = ( 1.0 - transform_gate_DA1 ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * 
layer3_DA1.output_D_doc_level_rep # overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep # overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep # overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep # overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A1 = ( 1.0 - transform_gate_A1 ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel #overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel # overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel # overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel simi_sent_level1 = debug_print( cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') #simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2') # simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3') # simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4') simi_doc_level1 = debug_print( cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') #simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2') # simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3') # simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4') simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') #simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4') # simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1 # simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2 simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0 #simi_1 = simi_doc_level1 #simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0 # simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0 # simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 logistic_w, logistic_b = create_logistic_para(rng, 1, 2) logistic_para = [logistic_w, logistic_b] sent_w, sent_b = create_logistic_para(rng, 1, 2) doc_w, doc_b = create_logistic_para(rng, 1, 2) sent_para = [sent_w, sent_b] doc_para = [doc_w, doc_b] params += logistic_para params += sent_para params += doc_para load_model(params, model_filename) simi_sent = T.dot(sent_w, simi_sent_level1) + sent_b.dimshuffle(0, 'x') simi_sent = simi_sent.dimshuffle(1, 0) simi_sent = T.nnet.softmax(simi_sent) tmp_sent = T.log(simi_sent) simi_doc = T.dot(doc_w, simi_doc_level1) + doc_b.dimshuffle(0, 'x') simi_doc = simi_doc.dimshuffle(1, 0) simi_doc = T.nnet.softmax(simi_doc) tmp_doc = T.log(simi_doc) 
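    # Note on the scoring head around this point: each scalar similarity
    # (sentence-level, doc-level and, just below, overall-level) is mapped through
    # its own 1 -> 2 logistic layer, softmaxed into a two-class distribution, and
    # the log-probability of the gold label y from the three levels is averaged
    # into the cost (plus L2 on the conv, logistic and highway weights).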
#cost = margin - simi_1 simi_overall = T.dot(logistic_w, simi_overall_level1) + logistic_b.dimshuffle(0, 'x') simi_overall = simi_overall.dimshuffle(1, 0) simi_overall = T.nnet.softmax(simi_overall) predict = T.argmax(simi_overall, axis=1) tmp_overall = T.log(simi_overall) cost = -(tmp_overall[0][y] + tmp_doc[0][y] + tmp_sent[0][y]) / 3.0 L2_reg = (conv2_W**2).sum() + (conv_W**2).sum() + (logistic_w**2).sum() + ( high_W**2).sum() cost = cost + L2_weight * L2_reg #simi_1 = [simi_overall,simi_doc,simi_sent] # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) # #only use overall_simi # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_overall_level1 # nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # cost=T.maximum(0.0, margin+simi_2-simi_1) #cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1) # posi_simi=simi_1 # nega_simi=simi_2 #L2_reg =debug_print((high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() #cost=debug_print(cost+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [cost, simi_overall, simi_doc, simi_sent, predict], givens={ index_D: test_data_D[index], #a matrix # index_Q: test_data_Q[index], index_A1: test_data_A1[index], y: test_Label[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], # len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], # len_A2: test_Length_A2[index], # len_A3: test_Length_A3[index], # len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], # left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], # left_A2: test_leftPad_A2[index], # left_A3: test_leftPad_A3[index], # left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], # right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], }, on_unused_input='ignore') accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i = debug_print(grad_i, 'grad_i') acc = acc_i + T.sqr(grad_i) updates.append( (param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # acc = acc_i + T.sqr(grad_i) # if param_i == embeddings: # updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size))))) #AdaGrad # else: # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) train_model = theano.function( [index], [cost, simi_overall, simi_doc, simi_sent, predict], updates=updates, givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: 
train_data_A1[index], # index_A2: train_data_A2[index], # index_A3: train_data_A3[index], # index_A4: train_data_A4[index], y: train_Label[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], # len_A2: train_Length_A2[index], # len_A3: train_Length_A3[index], # len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], # left_A2: train_leftPad_A2[index], # left_A3: train_leftPad_A3[index], # left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], # right_A2: train_rightPad_A2[index] # right_A3: train_rightPad_A3[index], # right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### f.write('... training\n') # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch cost, simi_overall, simi_doc, simi_sent, predict = test_model(0) cost, simi_overall1, simi_doc, simi_sent, predict = test_model(1) cost, simi_overall2, simi_doc, simi_sent, predict = test_model(2) cost, simi_overall3, simi_doc, simi_sent, predict = test_model(3) return simi_overall, simi_overall1, simi_overall2, simi_overall3 '''
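# A NumPy sketch of the highway-style gate used above to mix sentence-level and
# doc-level representations: gate = sigmoid(high_W . sent_rep + high_b), output =
# (1 - gate) * sent_rep + gate * doc_rep. Shapes are toy and high_W is assumed
# square (create_highw_para itself is not shown here).
import numpy as np
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def highway_combine(sent_rep, doc_rep, high_W, high_b):
    gate = sigmoid(np.dot(high_W, sent_rep) + high_b)   # per-dimension mixing weight
    return (1.0 - gate) * sent_rep + gate * doc_rep
k = 4
sent_rep, doc_rep = np.random.randn(k), np.random.randn(k)
high_W, high_b = 0.1 * np.random.randn(k, k), np.zeros(k)
print(highway_combine(sent_rep, doc_rep, high_W, high_b))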
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'): hidden_size=emb_size model_options = locals().copy() print "model options", model_options rng = np.random.RandomState(1234) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,all_labels, word2id =load_SNLI_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_sents_l=np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') test_sents_l=np.asarray(all_sentences_l[2], dtype='int32') train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r=np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32') train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store=np.asarray(all_labels[0], dtype='int32') dev_labels_store=np.asarray(all_labels[1], dtype='int32') test_labels_store=np.asarray(all_labels[2], dtype='int32') train_size=len(train_labels_store) dev_size=len(dev_labels_store) test_size=len(test_labels_store) vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input_l=embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_r=embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)) #conv if nn=='CNN': conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size)) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) NN_para=[conv_W, conv_b] conv_input_l = common_input_l.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen) conv_model_l = Conv_with_input_para(rng, input=conv_input_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b) conv_output_l=conv_model_l.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1) conv_output_into_tensor3_l=conv_output_l.reshape((batch_size, hidden_size, maxSentLen-filter_size+1)) mask_for_conv_output_l=T.repeat(sents_mask_l[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, emb_size, maxSentLen-filter_size+1) mask_for_conv_output_l=(1.0-mask_for_conv_output_l)*(mask_for_conv_output_l-10) masked_conv_output_l=conv_output_into_tensor3_l+mask_for_conv_output_l #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings_l=T.max(masked_conv_output_l, axis=2) #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_input_r = common_input_r.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen) conv_model_r = Conv_with_input_para(rng, input=conv_input_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b) conv_output_r=conv_model_r.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1) conv_output_into_tensor3_r=conv_output_r.reshape((batch_size, hidden_size, maxSentLen-filter_size+1)) mask_for_conv_output_r=T.repeat(sents_mask_r[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, emb_size, maxSentLen-filter_size+1) mask_for_conv_output_r=(1.0-mask_for_conv_output_r)*(mask_for_conv_output_r-10) masked_conv_output_r=conv_output_into_tensor3_r+mask_for_conv_output_r #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings_r=T.max(masked_conv_output_r, axis=2) #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size #GRU if nn=='GRU': U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) NN_para=[U1, W1, b1] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias gru_input_l = common_input_l.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer_l=GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l, hidden_size, U1, W1, b1) sent_embeddings_l=gru_layer_l.output_sent_rep # (batch_size, hidden_size) gru_input_r = common_input_r.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer_r=GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r, hidden_size, U1, W1, b1) sent_embeddings_r=gru_layer_r.output_sent_rep # (batch_size, hidden_size) #LSTM if nn=='LSTM': LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) NN_para=LSTM_para_dict.values() # .values returns a list of parameters lstm_input_l = common_input_l.dimshuffle((0,2,1)) #LSTM has the same inpur format with GRU lstm_layer_l=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l, 
hidden_size, LSTM_para_dict) sent_embeddings_l=lstm_layer_l.output_sent_rep # (batch_size, hidden_size) lstm_input_r = common_input_r.dimshuffle((0,2,1)) #LSTM has the same inpur format with GRU lstm_layer_r=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r, hidden_size, LSTM_para_dict) sent_embeddings_r=lstm_layer_r.output_sent_rep # (batch_size, hidden_size) HL_layer_1_input = T.concatenate([sent_embeddings_l,sent_embeddings_r, sent_embeddings_l*sent_embeddings_r, cosine_matrix1_matrix2_rowwise(sent_embeddings_l,sent_embeddings_r).dimshuffle(0,'x')],axis=1) HL_layer_1_input_size = hidden_size*3+1 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh) #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative LR_input_size=HL_layer_1_input_size+2*hidden_size U_a = create_ensemble_para(rng, 3, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] LR_input=T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1) layer_LR=LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [embeddings]+NN_para+LR_para+HL_layer_1.params+HL_layer_2.params # put all model parameters together # L2_reg =L2norm_paraList([embeddings,conv_W, U_a]) # diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix) cost=loss#+Div_reg*diversify_reg#+L2_weight*L2_reg grads = T.grad(cost, params) # create a list of gradients for all model parameters accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #1e-8 is add to get rid of zero division updates.append((acc_i, acc)) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_train_batches=train_size/batch_size train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_dev_batches=dev_size/batch_size dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches=test_size/batch_size test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_acc_dev=0.0 max_acc_test=0.0 while epoch < n_epochs: epoch = epoch + 1 train_indices = range(train_size) random.Random(200).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu=0 cost_i=0.0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_indices[batch_id:batch_id+batch_size] cost_i+= train_model( train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter%500==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0: # print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' # past_time = time.time() error_sum=0.0 for dev_batch_id in dev_batch_start: # for each test batch error_i=dev_model( dev_sents_l[dev_batch_id:dev_batch_id+batch_size], dev_masks_l[dev_batch_id:dev_batch_id+batch_size], dev_sents_r[dev_batch_id:dev_batch_id+batch_size], dev_masks_r[dev_batch_id:dev_batch_id+batch_size], dev_labels_store[dev_batch_id:dev_batch_id+batch_size] ) error_sum+=error_i dev_accuracy=1.0-error_sum/(len(dev_batch_start)) if dev_accuracy > max_acc_dev: max_acc_dev=dev_accuracy print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev #best dev model, do test error_sum=0.0 for test_batch_id in test_batch_start: # for each test batch error_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_labels_store[test_batch_id:test_batch_id+batch_size] ) error_sum+=error_i test_accuracy=1.0-error_sum/(len(test_batch_start)) if test_accuracy > max_acc_test: max_acc_test=test_accuracy print '\t\tcurrent testbacc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test else: print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
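# A NumPy sketch (toy shapes) of the masking trick used in the CNN branch above:
# (1 - mask) * (mask - 10) is 0 at valid positions and -10 at padded ones, so adding
# it to the conv output before the max pushes padded columns out of the running
# (they are not literally zeroed), and the sentence embedding is a max over valid
# positions only.
import numpy as np
conv_out = np.array([[[0.2, 0.9, 0.1, 0.3],
                      [0.5, 0.4, 0.8, 0.7]]])           # (batch=1, hidden=2, conv_len=4)
mask = np.array([[1.0, 1.0, 0.0, 0.0]])                 # last two columns are padding
mask_term = (1.0 - mask) * (mask - 10.0)                # [0, 0, -10, -10]
mask_term = np.repeat(mask_term[:, None, :], conv_out.shape[1], axis=1)
sent_embedding = (conv_out + mask_term).max(axis=2)     # -> [[0.9, 0.5]]
print(sent_embedding)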