def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1,
                    window_width=(2, 5), emb_size=50, char_emb_size=4, hidden_size=200, margin=0.5,
                    L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40,
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21,
                    neg_all=100, train_size=200, test_size=200, mark='_forfun'):
    """Train the question/triple matching CNN and score the test set once per epoch.

    Every training example carries ``neg_all`` candidate triples and candidate 0
    is treated as the positive one.  For each candidate the model combines three
    cosine similarities (entity characters vs. mention characters, question
    words vs. relation words, question words vs. entity description, the last
    one weighted by 0.1), divides by 3 and adds ``0.5 * entity score``.  The
    objective is a hinge ranking loss with ``margin`` plus L2 and filter
    diversity regularisation, optimised with AdaGrad.  Whenever the test
    accuracy improves, the parameters are written with ``store_model_to_file``.

    NOTE(review): another ``evaluate_lenet5`` is defined later in this source;
    if both live in one module the later one replaces this one -- confirm they
    belong to separate scripts.

    Args:
        learning_rate: AdaGrad base step size.
        n_epochs: maximum number of passes over the training examples.
        word_nkerns, char_nkerns: number of word-level / char-level filters.
        batch_size: leading dimension used for every reshape inside one scan
            step and the stride between training batch starts.
        window_width: indexable pair; ``[0]`` is the word filter width and
            ``[1]`` the char filter width.
        emb_size, char_emb_size: word / char embedding dimensions.
        margin: hinge loss margin.
        L2_weight, Div_reg: weights of the L2 and diversity penalties.
        max_char_len, max_des_len, max_relation_len, max_Q_len: padded lengths
            of the char, description, relation and question sequences.
        train_neg_size: candidates used per training step (the positive plus
            ``train_neg_size - 1`` sampled negatives).
        neg_all: candidates stored per training example.
        train_size, test_size: expected number of training / test examples.
        mark: suffix forwarded to ``load_train``, to the word embedding file
            name and to ``store_model_to_file``.
        hidden_size, update_freq, norm_threshold, max_truncate: not read by
            this function; kept so existing callers keep working.

    Returns:
        None.  Progress is printed and the best model is stored on disk.
    """
    # Must remain the first statement so that only the arguments are captured.
    model_options = locals().copy()
    print('model options ' + str(model_options))

    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = ['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt',
                    'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size = load_train(
        triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len,
        max_Q_len, train_size, test_size, mark)
    print('vocab_size: ' + str(vocab_size) + ' char_size: ' + str(char_size))

    # Field order of both data sets: (pos_entity_char, pos_entity_des, relations,
    # entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids,
    # remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores).
    (train_pos_entity_char, train_pos_entity_des, train_relations,
     train_entity_char_lengths, train_entity_des_lengths, train_relation_lengths,
     train_mention_char_ids, train_remainQ_word_ids, train_mention_char_lens,
     train_remainQ_word_len, train_entity_scores) = [datasets[k] for k in range(11)]
    (test_pos_entity_char, test_pos_entity_des, test_relations,
     test_entity_char_lengths, test_entity_des_lengths, test_relation_lengths,
     test_mention_char_ids, test_remainQ_word_ids, test_mention_char_lens,
     test_remainQ_word_len, test_entity_scores) = [datasets_test[k] for k in range(11)]

    def check_sizes(fields, expected):
        # Every field must hold exactly `expected` examples.  (Comparing the
        # average length, as was done before, lets mismatches cancel out.)
        sizes = [len(field) for field in fields]
        if any(size != expected for size in sizes):
            print('weird size: ' + str(sizes))
            raise SystemExit(1)  # non-zero: this is a failure, not a clean exit

    check_sizes([train_pos_entity_char, train_pos_entity_des, train_relations,
                 train_entity_char_lengths, train_entity_des_lengths, train_relation_lengths,
                 train_mention_char_ids, train_remainQ_word_ids, train_mention_char_lens,
                 train_remainQ_word_len, train_entity_scores], train_size)
    check_sizes([test_pos_entity_char, test_pos_entity_des, test_relations,
                 test_entity_char_lengths, test_entity_des_lengths, test_relation_lengths,
                 test_mention_char_ids, test_remainQ_word_ids, test_mention_char_lens,
                 test_remainQ_word_len, test_entity_scores], test_size)

    n_train_batches = train_size // batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)

    # Training data lives in Theano shared matrices; test examples have a
    # variable number of candidates and are converted one by one at test time.
    indices_train_pos_entity_char = pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des = pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations = pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths = pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths = pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths = pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids = pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids = pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens = pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len = pythonList_into_theanoIntMatrix(train_remainQ_word_len)
    indices_train_entity_scores = pythonList_into_theanoFloatMatrix(train_entity_scores)

    # Row 0 of both embedding tables is zeroed here and forced back to zero in
    # every update below (presumably the padding id -- confirm with the loader).
    rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, rootPath + 'word_emb' + mark + '.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX,
                                           numpy.random.RandomState(1234))
    char_rand_values[0] = numpy.array(numpy.zeros(char_emb_size), dtype=theano.config.floatX)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # Symbolic inputs: one row per candidate triple.
    index = T.lscalar()
    chosed_indices = T.lvector()
    ent_char_ids_M = T.lmatrix()
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()
    men_lens_M = T.lmatrix()
    rel_word_ids_M = T.lmatrix()
    rel_word_lens_M = T.lmatrix()
    desH_word_ids_M = T.lmatrix()
    desH_word_lens_M = T.lmatrix()
    q_word_ids_M = T.lmatrix()
    q_word_lens_M = T.lmatrix()
    ent_scores = T.dvector()

    filter_size = (emb_size, window_width[0])
    char_filter_size = (char_emb_size, window_width[1])

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    char_filter_shape = (char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b,
              q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b]

    # One row per filter, as consumed by Diversify_Reg.
    char_conv_W_into_matrix = char_conv_W.reshape(
        (char_conv_W.shape[0], char_conv_W.shape[2] * char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix = q_rel_conv_W.reshape(
        (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix = q_desH_conv_W.reshape(
        (q_desH_conv_W.shape[0], q_desH_conv_W.shape[2] * q_desH_conv_W.shape[3]))

    def embed(table, ids, length, dim):
        # Look the ids up and lay the result out as (batch, 1, dim, length).
        looked_up = table[ids.flatten()].reshape((batch_size, length, dim))
        return looked_up.transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    def convolve(inputs, dim, length, filter_shape, W, b):
        return Conv_with_input_para(rng, input=inputs,
                                    image_shape=(batch_size, 1, dim, length),
                                    filter_shape=filter_shape, W=W, b=b)

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f,
                               desH_word_ids_f, desH_word_lens_f, men_char_ids_f, q_word_ids_f,
                               men_lens_f, q_word_lens_f):
        # One scan step: similarity between the question and one candidate triple.
        ent_char_input = embed(char_embeddings, ent_char_ids_f, max_char_len, char_emb_size)
        men_char_input = embed(char_embeddings, men_char_ids_f, max_char_len, char_emb_size)
        rel_word_input = embed(embeddings, rel_word_ids_f, max_relation_len, emb_size)
        desH_word_input = embed(embeddings, desH_word_ids_f, max_des_len, emb_size)
        q_word_input = embed(embeddings, q_word_ids_f, max_Q_len, emb_size)

        # Entity name vs. mention share the char filters.
        ent_char_conv = convolve(ent_char_input, char_emb_size, max_char_len,
                                 char_filter_shape, char_conv_W, char_conv_b)
        men_char_conv = convolve(men_char_input, char_emb_size, max_char_len,
                                 char_filter_shape, char_conv_W, char_conv_b)
        # Question vs. relation share one set of word filters ...
        q_rel_conv = convolve(q_word_input, emb_size, max_Q_len,
                              word_filter_shape, q_rel_conv_W, q_rel_conv_b)
        rel_conv = convolve(rel_word_input, emb_size, max_relation_len,
                            word_filter_shape, q_rel_conv_W, q_rel_conv_b)
        # ... question vs. description another one.
        q_desH_conv = convolve(q_word_input, emb_size, max_Q_len,
                               word_filter_shape, q_desH_conv_W, q_desH_conv_b)
        desH_conv = convolve(desH_word_input, emb_size, max_des_len,
                             word_filter_shape, q_desH_conv_W, q_desH_conv_b)

        # Elements 0 and 2 of each length triple are passed as left/right
        # bounds and element 1 as the length (presumably padding sizes and
        # true length -- confirm with the pooling helpers).
        ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output,
                                    left_l=men_lens_f[0], right_l=men_lens_f[2])
        rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(
            rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling,
            left_l=q_word_lens_f[0], right_l=q_word_lens_f[2],
            length_l=q_word_lens_f[1] + filter_size[1] - 1,
            dim=max_Q_len + filter_size[1] - 1, topk=2)
        q_desH_pool = Max_Pooling(rng, input_l=q_desH_conv.output,
                                  left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        desH_conv_pool = Max_Pooling(rng, input_l=desH_conv.output,
                                     left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])

        overall_simi = (cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling) +
                        cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling) +
                        0.1 * cosine(q_desH_pool.output_maxpooling,
                                     desH_conv_pool.output_maxpooling)) / 3.0
        return overall_simi

    # The scan updates are not used (they were overwritten by the AdaGrad
    # update list in the original code as well).
    simi_list, _scan_updates = theano.scan(
        SimpleQ_matches_Triple,
        sequences=[ent_char_ids_M, ent_lens_M, rel_word_ids_M, rel_word_lens_M,
                   desH_word_ids_M, desH_word_lens_M, men_char_ids_M, q_word_ids_M,
                   men_lens_M, q_word_lens_M])

    simi_list += 0.5 * ent_scores

    # Hinge ranking loss: candidate 0 (positive) against all others.
    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.mean(loss_simi_list)

    L2_reg = debug_print((char_embeddings ** 2).sum() + (embeddings ** 2).sum() +
                         (char_conv_W ** 2).sum() + (q_rel_conv_W ** 2).sum() +
                         (q_desH_conv_W ** 2).sum(), 'L2_reg')
    diversify_reg = (Diversify_Reg(char_conv_W_into_matrix) +
                     Diversify_Reg(q_rel_conv_W_into_matrix) +
                     Diversify_Reg(q_desH_conv_W_into_matrix))
    cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg

    test_model = theano.function(
        [ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M,
         desH_word_ids_M, desH_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores],
        [loss_simi, simi_list], on_unused_input='ignore')

    # AdaGrad: one squared-gradient accumulator per parameter.
    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        stepped = param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10)
        # Identity test: `==` on Theano variables is an identity comparison
        # anyway, `is` states that explicitly.
        if param_i is embeddings:
            # Keep embedding row 0 at zero after the step.
            updates.append((param_i, T.set_subtensor(
                stepped[0], theano.shared(numpy.zeros(emb_size)))))
        elif param_i is char_embeddings:
            updates.append((param_i, T.set_subtensor(
                stepped[0], theano.shared(numpy.zeros(char_emb_size)))))
        else:
            updates.append((param_i, stepped))
        updates.append((acc_i, acc))

    def chosen_rows(train_matrix, width):
        # Example `index` stores neg_all candidates back to back; keep the
        # sampled ones only.
        per_candidate = train_matrix[index].reshape((neg_all, width))
        return per_candidate[chosed_indices].reshape((train_neg_size, width))

    train_model = theano.function(
        [index, chosed_indices], [loss_simi, cost], updates=updates,
        givens={
            ent_char_ids_M: chosen_rows(indices_train_pos_entity_char, max_char_len),
            ent_lens_M: chosen_rows(indices_train_entity_char_lengths, 3),
            men_char_ids_M: chosen_rows(indices_train_mention_char_ids, max_char_len),
            men_lens_M: chosen_rows(indices_train_mention_char_lens, 3),
            rel_word_ids_M: chosen_rows(indices_train_relations, max_relation_len),
            rel_word_lens_M: chosen_rows(indices_train_relation_lengths, 3),
            desH_word_ids_M: chosen_rows(indices_train_pos_entity_des, max_des_len),
            desH_word_lens_M: chosen_rows(indices_train_entity_des_lengths, 3),
            q_word_ids_M: chosen_rows(indices_train_remainQ_word_ids, max_Q_len),
            q_word_lens_M: chosen_rows(indices_train_remainQ_word_len, 3),
            ent_scores: indices_train_entity_scores[index][chosed_indices]},
        on_unused_input='ignore')

    def test_candidates(rows, i, width):
        # Test example i as an int64 matrix with one row per candidate.
        return numpy.asarray(rows[i], dtype='int64').reshape(
            (length_per_example_test[i], width))

    def test_accuracy():
        # Fraction of test examples whose candidate 0 gets the top score.
        succ = 0
        for i in range(test_size):
            _, simi_list_i = test_model(
                test_candidates(test_pos_entity_char, i, max_char_len),
                test_candidates(test_entity_char_lengths, i, 3),
                test_candidates(test_mention_char_ids, i, max_char_len),
                test_candidates(test_mention_char_lens, i, 3),
                test_candidates(test_relations, i, max_relation_len),
                test_candidates(test_relation_lengths, i, 3),
                test_candidates(test_pos_entity_des, i, max_des_len),
                test_candidates(test_entity_des_lengths, i, 3),
                test_candidates(test_remainQ_word_ids, i, max_Q_len),
                test_candidates(test_remainQ_word_len, i, 3),
                numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX))
            # A single candidate has no competitors; max() of an empty slice
            # would raise, so count it as a success.
            if len(simi_list_i) == 1 or simi_list_i[0] >= max(simi_list_i[1:]):
                succ += 1
        return succ * 1.0 / test_size

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    patience = 500000000000000  # effectively disables the iteration cap below

    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    best_test_accu = 0.0
    best_iter = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_index = 0
        for batch_start in train_batch_start:
            # Number of batches processed so far, counting this one.
            iter_count = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            # Always keep the positive (index 0), sample the negatives.
            sample_indices = [0] + random.sample(range(1, neg_all), train_neg_size - 1)
            loss_simi_i, cost_i = train_model(batch_start, sample_indices)

            if iter_count % n_train_batches == 0:
                # End of an epoch: report the last batch and score the test set.
                print(' '.join(map(str, [
                    'training @ iter = ' + str(iter_count) + '\tloss_simi_i: ',
                    loss_simi_i, 'cost_i:', cost_i])))
                succ = test_accuracy()
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                       'model %f') % (epoch, minibatch_index, n_train_batches, succ))
                if best_test_accu < succ:
                    best_test_accu = succ
                    best_iter = iter_count
                    store_model_to_file(rootPath, params, mark)

            if patience <= iter_count:
                done_looping = True
                break

        print(' '.join(map(str, [
            'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'])))
        mid_time = time.clock()

    end_time = time.clock()
    print('Optimization complete.')
    # Report what was actually tracked (the old summary printed placeholders
    # that were never updated).
    print('Best test accuracy of %f %% obtained at iteration %i'
          % (best_test_accu * 100., best_iter))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=500, char_nkerns=100, batch_size=1,
                    window_width=3, emb_size=500, char_emb_size=100, hidden_size=200, margin=0.5,
                    L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40,
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=6,
                    neg_all=100, train_size=75893, test_size=19168,
                    mark='_BiasedMaxPool_lr0.1_word500_char100_noDes_ent2.0',
                    total_test_size=21687):
    """Load a stored model and report its accuracy on the test file.

    The network is rebuilt without the entity-description branch, its
    parameters are filled by ``load_model_from_file(rootPath, params, mark)``
    and every test example is scored.  An example counts as correct when
    candidate 0 has the highest score (or is the only candidate).  The printed
    accuracy is a percentage of ``total_test_size``, not of ``test_size``.

    NOTE(review): another ``evaluate_lenet5`` (the training routine) is defined
    earlier in this source; if both live in one module this definition replaces
    it -- confirm they belong to separate scripts.

    Args:
        word_nkerns, char_nkerns: number of word-level / char-level filters.
        batch_size: leading dimension used for every reshape inside one scan
            step.
        window_width: filter width for both word and char convolutions.
        emb_size, char_emb_size: word / char embedding dimensions.
        margin: hinge margin of the reported loss.
        max_char_len, max_des_len, max_relation_len, max_Q_len: padded sequence
            lengths, forwarded to ``load_test_or_valid``.
        test_size: number of examples expected in the loaded test set.
        mark: suffix selecting the vocabulary and the stored model.
        total_test_size: denominator of the reported accuracy; presumably the
            size of the full test set including questions missing from the
            loaded file -- confirm.  The default reproduces the previously
            hard-coded value.
        learning_rate, n_epochs, hidden_size, L2_weight, update_freq,
        norm_threshold, max_truncate, train_neg_size, neg_all, train_size:
            not read by this function; kept so existing callers keep working.

    Returns:
        None.  Results are printed.
    """
    # Must remain the first statement so that only the arguments are captured.
    model_options = locals().copy()
    print('model options ' + str(model_options))

    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = ['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt',
                    'annotated_fb_data_test.entitylinking.top20_succSet_asInput.fromMo_FB5M.txt']

    rng = numpy.random.RandomState(23455)
    word2id, char2id = load_word2id_char2id(mark)
    datasets_test, length_per_example_test, word2id, char2id = load_test_or_valid(
        triple_files[1], char2id, word2id, max_char_len, max_des_len, max_relation_len,
        max_Q_len, test_size)
    vocab_size = len(word2id)
    char_size = len(char2id)
    print('vocab_size: ' + str(vocab_size) + ' char_size: ' + str(char_size))

    # Field order of the data set: (pos_entity_char, pos_entity_des, relations,
    # entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids,
    # remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores).
    # The two description fields (1 and 4) are not used by this model.
    test_data = datasets_test
    test_pos_entity_char = test_data[0]
    test_relations = test_data[2]
    test_entity_char_lengths = test_data[3]
    test_relation_lengths = test_data[5]
    test_mention_char_ids = test_data[6]
    test_remainQ_word_ids = test_data[7]
    test_mention_char_lens = test_data[8]
    test_remainQ_word_len = test_data[9]
    test_entity_scores = test_data[10]

    # Every field must hold exactly test_size examples.  (Comparing the average
    # length, as was done before, lets mismatches cancel out.)
    test_sizes = [len(field) for field in (
        test_pos_entity_char, test_relations, test_entity_char_lengths,
        test_relation_lengths, test_mention_char_ids, test_remainQ_word_ids,
        test_mention_char_lens, test_remainQ_word_len, test_entity_scores)]
    if any(size != test_size for size in test_sizes):
        print('weird size: ' + str(test_sizes))
        raise SystemExit(1)  # non-zero: this is a failure, not a clean exit

    # Random values only fix the shapes; load_model_from_file is called on
    # these parameters below.
    rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX,
                                      numpy.random.RandomState(1234))
    embeddings = theano.shared(value=rand_values, borrow=True)
    char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX,
                                           numpy.random.RandomState(1234))
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # Symbolic inputs: one row per candidate triple.
    ent_char_ids_M = T.imatrix()
    ent_lens_M = T.imatrix()
    men_char_ids_M = T.imatrix()
    men_lens_M = T.imatrix()
    rel_word_ids_M = T.imatrix()
    rel_word_lens_M = T.imatrix()
    q_word_ids_M = T.imatrix()
    q_word_lens_M = T.imatrix()
    ent_scores = T.fvector()

    filter_size = (emb_size, window_width)
    char_filter_size = (char_emb_size, window_width)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')
    char_filter_shape = (char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b]
    load_model_from_file(rootPath, params, mark)

    def embed(table, ids, length, dim):
        # Look the ids up and lay the result out as (batch, 1, dim, length).
        looked_up = table[ids.flatten()].reshape((batch_size, length, dim))
        return looked_up.transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    def convolve(inputs, dim, length, filter_shape, W, b):
        return Conv_with_input_para(rng, input=inputs,
                                    image_shape=(batch_size, 1, dim, length),
                                    filter_shape=filter_shape, W=W, b=b)

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f, rel_word_lens_f,
                               men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f):
        # One scan step: similarity between the question and one candidate triple.
        ent_char_input = embed(char_embeddings, ent_char_ids_f, max_char_len, char_emb_size)
        men_char_input = embed(char_embeddings, men_char_ids_f, max_char_len, char_emb_size)
        rel_word_input = embed(embeddings, rel_word_ids_f, max_relation_len, emb_size)
        q_word_input = embed(embeddings, q_word_ids_f, max_Q_len, emb_size)

        # Entity name vs. mention share the char filters.
        ent_char_conv = convolve(ent_char_input, char_emb_size, max_char_len,
                                 char_filter_shape, char_conv_W, char_conv_b)
        men_char_conv = convolve(men_char_input, char_emb_size, max_char_len,
                                 char_filter_shape, char_conv_W, char_conv_b)
        # Question vs. relation share the word filters.
        q_rel_conv = convolve(q_word_input, emb_size, max_Q_len,
                              word_filter_shape, q_rel_conv_W, q_rel_conv_b)
        rel_conv = convolve(rel_word_input, emb_size, max_relation_len,
                            word_filter_shape, q_rel_conv_W, q_rel_conv_b)

        # Elements 0 and 2 of each length triple are passed as left/right
        # bounds and element 1 as the length (presumably padding sizes and
        # true length -- confirm with the pooling helpers).
        ent_conv_pool = Max_Pooling(rng, input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng, input_l=men_char_conv.output,
                                    left_l=men_lens_f[0], right_l=men_lens_f[2])
        rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(
            rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling,
            left_l=q_word_lens_f[0], right_l=q_word_lens_f[2],
            length_l=q_word_lens_f[1] + filter_size[1] - 1,
            dim=max_Q_len + filter_size[1] - 1, topk=2)

        overall_simi = (cosine(ent_conv_pool.output_maxpooling,
                               men_conv_pool.output_maxpooling) * 0.33333 +
                        cosine(q_rel_pool.output_maxpooling,
                               rel_conv_pool.output_maxpooling) * 0.55)
        return overall_simi

    simi_list, _scan_updates = theano.scan(
        SimpleQ_matches_Triple,
        sequences=[ent_char_ids_M, ent_lens_M, rel_word_ids_M, rel_word_lens_M,
                   men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M])

    simi_list += 0.2 * ent_scores

    # Hinge ranking loss: candidate 0 (positive) against all others.
    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.sum(loss_simi_list)

    test_model = theano.function(
        [ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M,
         rel_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores],
        [loss_simi, simi_list], on_unused_input='ignore')

    def test_candidates(rows, i, width):
        # Test example i as an int32 matrix with one row per candidate.
        return numpy.asarray(rows[i], dtype='int32').reshape(
            (length_per_example_test[i], width))

    ##############
    # TEST MODEL #
    ##############
    print('... testing')

    mid_time = time.clock()
    epoch = 0  # only echoed in the timing line below

    # Share of all questions covered by the loaded test set; used to scale the
    # running accuracy to the same basis as the final figure.
    coverage = test_size * 1.0 / total_test_size

    succ = 0
    for i in range(test_size):
        _, simi_list_i = test_model(
            test_candidates(test_pos_entity_char, i, max_char_len),
            test_candidates(test_entity_char_lengths, i, 3),
            test_candidates(test_mention_char_ids, i, max_char_len),
            test_candidates(test_mention_char_lens, i, 3),
            test_candidates(test_relations, i, max_relation_len),
            test_candidates(test_relation_lengths, i, 3),
            test_candidates(test_remainQ_word_ids, i, max_Q_len),
            test_candidates(test_remainQ_word_len, i, 3),
            numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX))
        # A single candidate has no competitors and counts as a success.
        if len(simi_list_i) == 1 or simi_list_i[0] >= max(simi_list_i[1:]):
            succ += 1
        if i % 1000 == 0:
            print(' '.join(map(str, [
                'testing', i, '...acc:', (succ * 1.0 / (i + 1)) * coverage])))

    succ = succ * 100.0 / total_test_size
    print('accu: ' + str(succ))
    print(' '.join(map(str, [
        'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'])))
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[1,1], batch_size=1, window_width=3,
                    maxSentLength=60, emb_size=300, L2_weight=0.0005, update_freq=1,
                    unifiedWidth_conv0=8, k_dy=3, ktop=3):
    """Train and evaluate a two-layer Bi-CNN sentence-pair classifier on the MSR corpus.

    The corpus is loaded with ``load_msr_corpus``; even-indexed rows are used as
    the left sentence of a pair and odd-indexed rows as the right sentence.
    Both sentences go through two shared-weight
    ``Conv_Fold_DynamicK_PoolLayer_NAACL`` layers, pair features are produced by
    ``Average_Pooling_for_Top`` after each layer, and a ``LogisticRegression``
    layer classifies the concatenated features.  Once per epoch the same
    features are also fed to a linear SVM and a scikit-learn logistic
    regression, and the best accuracy of the three classifiers is tracked.

    Args:
        learning_rate: Step size of the AdaGrad update.
        n_epochs: Maximum number of passes over the training pairs.
        nkerns: Number of kernels of the first and second convolution layer.
            The default list is never mutated here.
        batch_size: Pairs per batch.  The graph reshapes pair features to
            ``(1, emb_size)`` and reads ``y[0]``, so only 1 is supported.
        window_width: Width of the convolution filters.
        maxSentLength: Padded sentence length used by the loader and the graph.
        emb_size: Word embedding dimensionality.
        L2_weight: Weight of the L2 penalty on ``layer3.W`` and both conv filters.
        update_freq: A parameter update is applied every ``update_freq``
            batches; the batches in between only accumulate their cost.
        unifiedWidth_conv0: ``unifiedWidth`` passed to the first conv layer.
        k_dy: ``k`` passed to the first conv layer.
        ktop: ``k`` and ``unifiedWidth`` passed to the second conv layer.

    Returns:
        None.  Progress and results are printed to stdout; the run time is
        written to stderr.
    """
    # Must stay the first statement so that only the arguments are captured.
    model_options = locals().copy()
    print('model options %s' % (model_options,))

    rootPath = '/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt',
                                           rootPath+'tokenized_test.txt', maxSentLength)

    def split_pairs(rows):
        # Even rows belong to the left sentence, odd rows to the right one.
        return rows[::2], rows[1::2]

    def as_int64_shared(matrix):
        # Store the index matrix as a floatX shared variable, use it as int64.
        stored = theano.shared(numpy.asarray(matrix, dtype=theano.config.floatX), borrow=True)
        return T.cast(stored, 'int64')

    (indices_train, trainY, trainLengths, normalized_train_length,
     trainLeftPad, trainRightPad) = datasets[0]
    raw_train_l = indices_train[::2, :]
    raw_train_r = indices_train[1::2, :]
    trainLengths_l, trainLengths_r = split_pairs(trainLengths)
    normalized_train_length_l, normalized_train_length_r = split_pairs(normalized_train_length)
    trainLeftPad_l, trainLeftPad_r = split_pairs(trainLeftPad)
    trainRightPad_l, trainRightPad_r = split_pairs(trainRightPad)

    (indices_test, testY, testLengths, normalized_test_length,
     testLeftPad, testRightPad) = datasets[1]
    raw_test_l = indices_test[::2, :]
    raw_test_r = indices_test[1::2, :]
    testLengths_l, testLengths_r = split_pairs(testLengths)
    normalized_test_length_l, normalized_test_length_r = split_pairs(normalized_test_length)
    testLeftPad_l, testLeftPad_r = split_pairs(testLeftPad)
    testRightPad_l, testRightPad_r = split_pairs(testRightPad)

    # Floor division: identical to the Python 2 integer "/" used originally.
    n_train_batches = raw_train_l.shape[0] // batch_size
    n_test_batches = raw_test_l.shape[0] // batch_size
    train_batch_start = list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start = list(numpy.arange(n_test_batches)*batch_size)

    indices_train_l = as_int64_shared(raw_train_l)
    indices_train_r = as_int64_shared(raw_train_r)
    indices_test_l = as_int64_shared(raw_test_l)
    indices_test_r = as_int64_shared(raw_test_r)

    # Row 0 of the embedding matrix is forced to zeros before the pretrained
    # vectors are loaded.
    rand_values = random_value_normal((vocab_size+1, emb_size), theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size))
    rand_values = load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    # Symbolic inputs.
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # word-index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    cost_tmp = T.dscalar()   # cost accumulated over the non-updating batches

    ishape = (emb_size, maxSentLength)   # shape of one embedded sentence
    filter_size = (emb_size, window_width)
    length_after_wideConv0 = ishape[1]+filter_size[1]-1
    poolsize1 = (1, length_after_wideConv0)
    length_after_wideConv1 = unifiedWidth_conv0+filter_size[1]-1
    poolsize2 = (1, length_after_wideConv1)
    half_emb = ishape[0] // 2

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Look up embeddings and arrange them as (batch, 1, emb_size, maxSentLength).
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    # First convolution layer; both sentences share conv_W / conv_b.
    filter_shape0 = (nkerns[0], 1, filter_size[0], filter_size[1])
    conv_W, conv_b = create_conv_para(rng, filter_shape=filter_shape0)
    layer0_ll = Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=filter_shape0, poolsize=poolsize1, k=k_dy,
            unifiedWidth=unifiedWidth_conv0, left=left_l, right=right_l,
            W=conv_W, b=conv_b, firstLayer=True)
    layer0_rr = Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=filter_shape0, poolsize=poolsize1, k=k_dy,
            unifiedWidth=unifiedWidth_conv0, left=left_r, right=right_r,
            W=conv_W, b=conv_b, firstLayer=True)
    layer0_l_output = debug_print(layer0_ll.fold_output, 'layer0_l.output')
    layer0_r_output = debug_print(layer0_rr.fold_output, 'layer0_r.output')
    layer1 = Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output,
            kern=half_emb, left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r,
            length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
            dim=maxSentLength+filter_size[1]-1)

    # Second convolution layer.  The weights are created with the same filter
    # shape the layers declare; the previous hard-coded (1, 1, ...) shape only
    # matched for the default nkerns=[1,1].
    filter_shape1 = (nkerns[1], nkerns[0], filter_size[0] // 2, filter_size[1])
    conv_W2, conv_b2 = create_conv_para(rng, filter_shape=filter_shape1)
    layer1_ll = Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_ll.output,
            image_shape=(batch_size, nkerns[0], half_emb, unifiedWidth_conv0),
            filter_shape=filter_shape1, poolsize=poolsize2, k=ktop, unifiedWidth=ktop,
            left=layer0_ll.leftPad, right=layer0_ll.rightPad,
            W=conv_W2, b=conv_b2, firstLayer=False)
    layer1_rr = Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_rr.output,
            image_shape=(batch_size, nkerns[0], half_emb, unifiedWidth_conv0),
            filter_shape=filter_shape1, poolsize=poolsize2, k=ktop, unifiedWidth=ktop,
            left=layer0_rr.leftPad, right=layer0_rr.rightPad,
            W=conv_W2, b=conv_b2, firstLayer=False)
    layer1_l_output = debug_print(layer1_ll.fold_output, 'layer1_l.output')
    layer1_r_output = debug_print(layer1_rr.fold_output, 'layer1_r.output')
    layer2 = Average_Pooling_for_Top(rng, input_l=layer1_l_output, input_r=layer1_r_output,
            kern=ishape[0] // 4,
            left_l=layer0_ll.leftPad, right_l=layer0_ll.rightPad,
            left_r=layer0_rr.leftPad, right_r=layer0_rr.rightPad,
            length_l=k_dy+filter_size[1]-1, length_r=k_dy+filter_size[1]-1,
            dim=unifiedWidth_conv0+filter_size[1]-1)

    # Features computed from the summed word embeddings of each sentence.
    sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    uni_cosine = cosine(sum_uni_l, sum_uni_r)
    eucli_1 = 1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))

    layer3_input = T.concatenate([
        eucli_1, uni_cosine,
        layer1.output_eucli_to_simi, layer1.output_cosine, layer1.output_attentions,
        layer2.output_eucli_to_simi, layer2.output_cosine, layer2.output_attentions,
        len_l, len_r,
    ], axis=1)
    # n_in mirrors the concatenation above: 2 embedding-sum features,
    # (2 + 4*4) per pooling layer, and 2 length features.
    layer3 = LogisticRegression(rng, input=layer3_input, n_in=(2)+(2+4*4)+(2+4*4)+2, n_out=2)

    L2_reg = debug_print((layer3.W** 2).sum()+(conv_W** 2).sum()+(conv_W2**2).sum(), 'L2_reg')
    cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this')
    cost = debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')

    def pair_givens(idx_l, idx_r, labels, lpad_l, rpad_l, lpad_r, rpad_r,
                    lens_l, lens_r, norm_lens_l, norm_lens_r):
        # Substitutions that feed the pair starting at `index` into the graph.
        return {
            x_index_l: idx_l[index: index + batch_size],
            x_index_r: idx_r[index: index + batch_size],
            y: labels[index: index + batch_size],
            left_l: lpad_l[index],
            right_l: rpad_l[index],
            left_r: lpad_r[index],
            right_r: rpad_r[index],
            length_l: lens_l[index],
            length_r: lens_r[index],
            norm_length_l: norm_lens_l[index],
            norm_length_r: norm_lens_r[index],
        }

    def train_givens():
        return pair_givens(indices_train_l, indices_train_r, trainY,
                           trainLeftPad_l, trainRightPad_l, trainLeftPad_r, trainRightPad_r,
                           trainLengths_l, trainLengths_r,
                           normalized_train_length_l, normalized_train_length_r)

    test_model = theano.function([index], [layer3.errors(y), layer3.y_pred, layer3_input, y],
            givens=pair_givens(indices_test_l, indices_test_r, testY,
                               testLeftPad_l, testRightPad_l, testLeftPad_r, testRightPad_r,
                               testLengths_l, testLengths_r,
                               normalized_test_length_l, normalized_test_length_r),
            on_unused_input='ignore')

    # Only the classifier weights and the two convolution filters are trained.
    params = layer3.params + [conv_W] + [conv_W2]
    accumulator = [
        theano.shared(numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX),
                      borrow=True)
        for para_i in params
    ]

    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        # AdaGrad: scale each step by the root of the accumulated squared gradients.
        # NOTE(review): there is no epsilon inside the sqrt, so a component whose
        # gradient has been exactly zero so far divides 0 by 0 -- confirm this
        # cannot happen before relying on long runs.
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))
        updates.append((acc_i, acc))

    train_model = theano.function([index, cost_tmp], [cost, layer3.errors(y), layer3_input],
            updates=updates, givens=train_givens(), on_unused_input='ignore')
    train_model_predict = theano.function([index], [cost_this, layer3.errors(y), layer3_input, y],
            givens=train_givens(), on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    patience = 500000000000000   # effectively disables early stopping
    # Evaluate once per epoch (patience // 2 is far larger than any epoch).
    validation_frequency = min(n_train_batches, patience // 2)

    test_score = 0.
    start_time = time.clock()
    epoch = 0
    done_looping = False
    max_acc = 0.0
    best_epoch = 0
    error_sum = 0              # errors of the batches waiting for the next update
    window_error = 0           # errors of the last completed update window
    cost_average = float('nan')  # defined even if no update step has run yet

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        shuffle(train_batch_start)   # shuffle training data
        pending_cost = 0.0
        for minibatch_index, batch_start in enumerate(train_batch_start, 1):
            # Number of batches processed so far, across all epochs.
            n_iter = (epoch - 1) * n_train_batches + minibatch_index

            if n_iter % update_freq != 0:
                # Forward pass only: remember cost and error for the next update.
                cost_ij, error_ij, _, _ = train_model_predict(batch_start)
                pending_cost += cost_ij
                error_sum += error_ij
            else:
                cost_average, error_ij, _ = train_model(batch_start, pending_cost)
                # Keep the window total for logging before resetting; the old
                # code reset first and therefore always logged 0 errors.
                window_error = error_sum + error_ij
                error_sum = 0
                pending_cost = 0.0   # reset for the next window

            if n_iter % n_train_batches == 0:
                print('training @ iter = '+str(n_iter)+' average cost: '+str(cost_average)+' error: '+str(window_error)+'/'+str(update_freq)+' error rate: '+str(window_error*1.0/update_freq))

            if n_iter % validation_frequency == 0:
                # Test-set pass: collect errors, gold labels and pair features.
                test_losses = []
                test_y = []
                test_features = []
                for test_start in test_batch_start:
                    test_loss, _, test_feat, test_gold = test_model(test_start)
                    test_losses.append(test_loss)
                    test_y.append(test_gold[0])
                    test_features.append(test_feat[0])
                test_score = numpy.mean(test_losses)
                test_acc = 1-test_score
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                       'model %f %%') % (epoch, minibatch_index, n_train_batches, (1-test_score) * 100.))

                # Re-extract training features to fit the external classifiers.
                train_y = []
                train_features = []
                for train_start in train_batch_start:
                    _, _, train_feat, train_gold = train_model_predict(train_start)
                    train_y.append(train_gold[0])
                    train_features.append(train_feat[0])

                clf = svm.SVC(kernel='linear')
                clf.fit(train_features, train_y)
                results = clf.predict(test_features)
                lr = linear_model.LogisticRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)

                n_test = len(test_y)
                corr_count = sum(1 for pred, gold in zip(results, test_y) if pred == gold)
                corr_lr = sum(1 for pred, gold in zip(results_lr, test_y)
                              if numpy.absolute(pred-gold) < 0.5)
                acc = corr_count*1.0/n_test
                acc_lr = corr_lr*1.0/n_test

                # Track the best accuracy reached by any of the three classifiers.
                epoch_best = max(acc, acc_lr, test_acc)
                if epoch_best > max_acc:
                    max_acc = epoch_best
                    best_epoch = epoch
                print(' '.join(str(part) for part in (
                    '\t\t\t\t\t\t\t\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr,
                    ' max acc: ', max_acc, ' at epoch: ', best_epoch)))

            if patience <= n_iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    # Report the values that are actually tracked; the old summary printed
    # best_validation_loss / best_iter, which were never updated.
    print('Best accuracy of %f %% obtained at epoch %i, '
          'with final test error %f %%' % (max_acc * 100., best_epoch, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, nkerns=[90,90], batch_size=1, window_width=2, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=0.2): maxSentLength=max_s_length+2*(window_width-1) maxDocLength=max_d_length+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DPNQ(rootPath+'vocab_DPNQ.txt', rootPath+'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', rootPath+'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, 
wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), # numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4), # numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4), # numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)] # return results, line_control [train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label, train_Length_D,train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3, train_leftPad_D,train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_rightPad_D,train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3]=train_data [test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label, test_Length_D,test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3, test_leftPad_D,test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_rightPad_D,test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3]=test_data n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # 
indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DPNQ_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() index_A1= T.lvector() index_A2= T.lvector() index_A3= T.lvector() # index_A4= T.lvector() # y = T.lvector() len_D=T.lscalar() len_D_s=T.lvector() # len_Q=T.lscalar() len_A1=T.lscalar() len_A2=T.lscalar() len_A3=T.lscalar() # len_A4=T.lscalar() left_D=T.lscalar() left_D_s=T.lvector() # left_Q=T.lscalar() left_A1=T.lscalar() left_A2=T.lscalar() left_A3=T.lscalar() # left_A4=T.lscalar() right_D=T.lscalar() right_D_s=T.lvector() # right_Q=T.lscalar() right_A1=T.lscalar() right_A2=T.lscalar() right_A3=T.lscalar() # right_A4=T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words=(emb_size,window_width) filter_sents=(nkerns[0], window_width) 
#poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) # layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para=[conv_W, conv_b] conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para=[conv2_W, conv2_b] high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # this part decides nkern[0] and nkern[1] must be in the same dimension highW_para=[high_W, high_b] params = layer2_para+layer0_para+highW_para#+[embeddings] #load_model(params) layer0_D = Conv_with_input_para(rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_Q = Conv_with_input_para(rng, 
input=layer0_Q_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) # layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input, # image_shape=(batch_size, 1, ishape[0], ishape[1]), # filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output') # layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A1_output=debug_print(layer0_A1.output, 'layer0_A1.output') layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output') layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output') # layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output') # layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA1=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A1+filter_words[1]-1, 
dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) # layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0], # left_D=left_D, right_D=right_D, # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1, # dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc # layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA1 = Conv_with_input_para(rng, input=layer1_DA1.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA3 = 
Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), # image_shape=(batch_size, 1, nkerns[0], dshape[1]), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights # layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A1 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA1.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), # image_shape=(batch_size, 1, nkerns[0], 1), # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) # layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') 
layer2_A1_output_sent_rep_Dlevel=debug_print(layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel') layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel') layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel') # layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel') # layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA1=Average_Pooling_for_Top(rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) # layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1], # left_l=left_D, right_l=right_D, left_r=0, right_r=0, # length_l=len_D+filter_sents[1]-1, length_r=1, # dim=maxDocLength+filter_sents[1]-1, topk=3) #high-way # transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') 
transform_gate_DA1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1') transform_gate_DA2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2') transform_gate_DA3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3') # transform_gate_DA4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4') # transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1') transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2') # transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3') # transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4') # overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A1=(1.0-transform_gate_DA1)*layer1_DA1.output_D_sent_level_rep+transform_gate_DA1*layer3_DA1.output_D_doc_level_rep overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep # overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep # overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel 
overall_A1=(1.0-transform_gate_A1)*layer1_DA1.output_QA_sent_level_rep+transform_gate_A1*layer2_A1.output_sent_rep_Dlevel overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel # overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel # overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel simi_sent_level1=debug_print(cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1') simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2') # simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3') # simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4') simi_doc_level1=debug_print(cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1') simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2') # simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3') # simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4') simi_overall_level1=debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1') simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4') # simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1 # simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2 
simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0 simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0 # simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0 # simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) # #only use overall_simi # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_overall_level1 # nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # cost=T.maximum(0.0, margin+simi_2-simi_1) simi_PQ=cosine(layer1_DA1.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep) simi_NQ=cosine(layer1_DA2.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep) #bad matching at overall level # simi_PQ=cosine(overall_A1, overall_D_A3) # simi_NQ=cosine(overall_A2, overall_D_A3) match_cost=T.maximum(0.0, margin+simi_NQ-simi_PQ) cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1) cost=cost#+match_cost # posi_simi=simi_1 # nega_simi=simi_2 L2_reg =debug_print((high_W**2).sum()+3*(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost=debug_print(cost+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], givens={ index_D: test_data_D[index], #a matrix # index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: 
test_data_A2[index], index_A3: test_data_A3[index], # index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], # len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], # len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], # left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], # left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], # right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index] # right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # acc = acc_i + T.sqr(grad_i) # if param_i == embeddings: # updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size))))) #AdaGrad # else: # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad # updates.append((acc_i, acc)) train_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], updates=updates, givens={ index_D: train_data_D[index], 
# index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], # index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], # len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], # left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index] # right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], givens={ index_D: train_data_D[index], # index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], # index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], # len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], # len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], # left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], # left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], # right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index] # right_A4: train_rightPad_A4[index] }, 
on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data posi_train_sent=[] nega_train_sent=[] posi_train_doc=[] nega_train_doc=[] posi_train_overall=[] nega_train_overall=[] for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) ) sys.stdout.flush() minibatch_index=minibatch_index+1 cost_average, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2= train_model(batch_start) posi_train_sent.append(simi_sent_level1) nega_train_sent.append(simi_sent_level2) posi_train_doc.append(simi_doc_level1) nega_train_doc.append(simi_doc_level2) posi_train_overall.append(simi_overall_level1) nega_train_overall.append(simi_overall_level2) if iter % n_train_batches == 0: corr_train_sent=compute_corr(posi_train_sent, nega_train_sent) corr_train_doc=compute_corr(posi_train_doc, nega_train_doc) corr_train_overall=compute_corr(posi_train_overall, nega_train_overall) print 'training @ 
iter = '+str(iter)+' average cost: '+str(cost_average)+'corr rate:'+str(corr_train_sent*300.0/train_size)+' '+str(corr_train_doc*300.0/train_size)+' '+str(corr_train_overall*300.0/train_size) if iter % validation_frequency == 0: posi_test_sent=[] nega_test_sent=[] posi_test_doc=[] nega_test_doc=[] posi_test_overall=[] nega_test_overall=[] for i in test_batch_start: cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2=test_model(i) posi_test_sent.append(simi_sent_level1) nega_test_sent.append(simi_sent_level2) posi_test_doc.append(simi_doc_level1) nega_test_doc.append(simi_doc_level2) posi_test_overall.append(simi_overall_level1) nega_test_overall.append(simi_overall_level2) corr_test_sent=compute_corr(posi_test_sent, nega_test_sent) corr_test_doc=compute_corr(posi_test_doc, nega_test_doc) corr_test_overall=compute_corr(posi_test_overall, nega_test_overall) #write_file.close() #test_score = numpy.mean(test_losses) test_acc_sent=corr_test_sent*1.0/(test_size/3.0) test_acc_doc=corr_test_doc*1.0/(test_size/3.0) test_acc_overall=corr_test_overall*1.0/(test_size/3.0) #test_acc=1-test_score # print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' # 'model %f %%') % # (epoch, minibatch_index, n_train_batches,test_acc * 100.)) print '\t\t\tepoch', epoch, ', minibatch', minibatch_index, '/', n_train_batches, 'test acc of best model', test_acc_sent*100,test_acc_doc*100,test_acc_overall*100 #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better=False if test_acc_sent > max_acc: max_acc=test_acc_sent best_epoch=epoch find_better=True if test_acc_doc > max_acc: max_acc=test_acc_doc best_epoch=epoch find_better=True if test_acc_overall > max_acc: max_acc=test_acc_overall best_epoch=epoch find_better=True print '\t\t\tmax:', max_acc,'(at',best_epoch,')' if find_better==True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= 
iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59): maxSentLength=max_s_length+2*(window_width-1) maxDocLength=max_d_length+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus(rootPath+'vocab.txt', rootPath+'mc500.train.tsv_standardlized.txt', rootPath+'mc500.test.tsv_standardlized.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True) #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' # mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt') # extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt') # discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, 
wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A, train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A, test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # 
allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A= T.lvector() y = T.lvector() len_D=T.lscalar() len_D_s=T.lvector() len_Q=T.lscalar() len_A=T.lscalar() left_D=T.lscalar() left_D_s=T.lvector() left_Q=T.lscalar() left_A=T.lscalar() right_D=T.lscalar() right_D_s=T.lvector() right_Q=T.lscalar() right_A=T.lscalar() #wmf=T.dmatrix() cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words=(emb_size,window_width) filter_sents=(nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A_input = embeddings[index_A.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) # load_model_for_conv1([conv_W, conv_b]) layer0_D = Conv_with_input_para(rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input, image_shape=(batch_size, 1, 
ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A = Conv_with_input_para(rng, input=layer0_A_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output') layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output') layer0_A_output=debug_print(layer0_A.output, 'layer0_A.output') layer0_para=[conv_W, conv_b] layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) layer1_DA=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A_output, kern=nkerns[0], left_D=left_D, right_D=right_D, left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A, right_r=right_A, length_D_s=len_D_s+filter_words[1]-1, length_r=len_A+filter_words[1]-1, dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3) conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0] #conv from sentence to doc layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_DA = Conv_with_input_para(rng, input=layer1_DA.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])), image_shape=(batch_size, 1, nkerns[0], dshape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) #conv single Q and A into doc level with same conv weights layer2_Q = 
Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_A = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)), image_shape=(batch_size, 1, nkerns[0], 1), filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b) layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel') layer2_A_output_sent_rep_Dlevel=debug_print(layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel') layer2_para=[conv2_W, conv2_b] layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) layer3_DA=Average_Pooling_for_Top(rng, input_l=layer2_DA.output, input_r=layer2_A_output_sent_rep_Dlevel, kern=nkerns[1], left_l=left_D, right_l=right_D, left_r=0, right_r=0, length_l=len_D+filter_sents[1]-1, length_r=1, dim=maxDocLength+filter_sents[1]-1, topk=3) #high-way high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ') transform_gate_DA=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b), 'transform_gate_DA') transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q') transform_gate_A=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b), 'transform_gate_A') highW_para=[high_W, high_b] 
overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q') overall_D_A=(1.0-transform_gate_DA)*layer1_DA.output_D_sent_level_rep+transform_gate_DA*layer3_DA.output_D_doc_level_rep overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel overall_A=(1.0-transform_gate_A)*layer1_DA.output_QA_sent_level_rep+transform_gate_A*layer2_A.output_sent_rep_Dlevel simi_sent_level=debug_print(cosine(layer1_DQ.output_D_sent_level_rep+layer1_DA.output_D_sent_level_rep, layer1_DQ.output_QA_sent_level_rep+layer1_DA.output_QA_sent_level_rep), 'simi_sent_level') simi_doc_level=debug_print(cosine(layer3_DQ.output_D_doc_level_rep+layer3_DA.output_D_doc_level_rep, layer2_Q.output_sent_rep_Dlevel+layer2_A.output_sent_rep_Dlevel), 'simi_doc_level') simi_overall_level=debug_print(cosine(overall_D_Q+overall_D_A, overall_Q+overall_A), 'simi_overall_level') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) layer4_input=debug_print(T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level ], axis=1), 'layer4_input')#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4=LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer4.W** 2).sum()+(high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer4.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') # # 
[train_data_D, train_data_Q, train_data_A, train_Y, train_Label, # train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A, # train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A, # train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data # [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, # test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A, # test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A, # test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data # index = T.lscalar() # index_D = T.lmatrix() # now, x is the index matrix, must be integer # index_Q = T.lvector() # index_A= T.lvector() # # y = T.lvector() # len_D=T.lscalar() # len_D_s=T.lvector() # len_Q=T.lscalar() # len_A=T.lscalar() # # left_D=T.lscalar() # left_D_s=T.lvector() # left_Q=T.lscalar() # left_A=T.lscalar() # # right_D=T.lscalar() # right_D_s=T.lvector() # right_Q=T.lscalar() # right_A=T.lscalar() # # # #wmf=T.dmatrix() # cost_tmp=T.dscalar() test_model = theano.function([index], [layer4.errors(y),layer4_input, y, layer4.prop_for_posi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A: test_data_A[index], y: test_Y[index:index+batch_size], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A: test_Length_A[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A: test_leftPad_A[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A: test_rightPad_A[index] }, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params+layer2_para+layer0_para+highW_para accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list 
of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index+batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer4.errors(y), layer4_input, y], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A: train_data_A[index], y: train_Y[index:index+batch_size], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A: train_Length_A[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A: train_leftPad_A[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A: train_rightPad_A[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 # readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r') # train_pairs=[] # train_y=[] # for line in readfile: # tokens=line.strip().split('\t') # listt=tokens[0]+'\t'+tokens[1] # train_pairs.append(listt) # train_y.append(tokens[2]) # readfile.close() # writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w') for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 sys.stdout.write( "Training :[%6f] %% complete!\r" % (batch_start*100.0/train_size) ) sys.stdout.flush() minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) 
#exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] test_y=[] test_features=[] test_prop=[] for i in test_batch_start: test_loss, layer3_input, y, posi_prop=test_model(i) test_prop.append(posi_prop[0][0]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() #test_score = numpy.mean(test_losses) test_acc=compute_test_acc(test_y, test_prop) #test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.decision_function(test_features) lr=linear_model.LogisticRegression(C=1e5) lr.fit(train_features, train_y) results_lr=lr.decision_function(test_features) acc_svm=compute_test_acc(test_y, results) acc_lr=compute_test_acc(test_y, results_lr) find_better=False if acc_svm > max_acc: max_acc=acc_svm best_epoch=epoch find_better=True if test_acc > max_acc: max_acc=test_acc best_epoch=epoch find_better=True if acc_lr> max_acc: max_acc=acc_lr best_epoch=epoch find_better=True print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc,'(at',best_epoch,')' # if find_better==True: # 
store_model_to_file(layer2_para, best_epoch) # print 'Finished storing best conv params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5], char_filter_size=3, margin=2.0, max_EM=50.302743615): test_batch_size=batch_size*10 model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) word2id={} char2id={} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len) train_size=len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size=len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) 
test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices= T.imatrix() #batch, (start, end) for each sample para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask=T.fmatrix() char_q_mask=T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size)) char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len)) char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len)) conv_W_char, conv_b_char=create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0])) conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_1_q, conv_b_1_q=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) NN_para=[conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char] input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,batch_size, p_len_limit,q_len_limit, emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size, conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char, para_mask, q_mask, char_p_masks,char_q_masks) test_input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,test_batch_size, test_p_len_limit,q_len_limit, emb_size, 
char_emb_size,char_len,filter_size,char_filter_size,hidden_size, conv_W_1, conv_b_1,conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char, para_mask, q_mask, char_p_masks,char_q_masks) #(batch, hidden, #(batch, 2*hidden, p_len_limit)) # gram_size = 5*true_p_len-(0+1+2+3+4) HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a=normalize_matrix(U_a) norm_HL_1_para=normalize_matrix(HL_1_para) norm_HL_2_para=normalize_matrix(HL_2_para) norm_HL_3_para=normalize_matrix(HL_3_para) norm_HL_4_para=normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para(rng, 1, hidden_size) end_norm_U_a=normalize_matrix(end_U_a) end_norm_HL_1_para=normalize_matrix(end_HL_1_para) end_norm_HL_2_para=normalize_matrix(end_HL_2_para) end_norm_HL_3_para=normalize_matrix(end_HL_3_para) end_norm_HL_4_para=normalize_matrix(end_HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, batch_size,true_p_len) span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len) end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para, end_norm_U_a, batch_size,true_p_len) end_span_scores=T.nnet.softmax(end_span_scores_matrix) #(batch, para_len) loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,0]])) 
end_loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,1]])) #ranking loss start tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), gold_indices[:,0]], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega)) #ranking loss END end_tanh_span_scores_matrix = end_span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), gold_indices[:,1]], 1.0) end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0-end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x',0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean(T.maximum(0.0, margin-end_repeat_posi+end_repeat_nega)) loss = loss_neg_likelihood +end_loss_neg_likelihood+loss_rank+end_loss_rank #test test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para,norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len) mask_test_return=T.argmax(test_span_scores_matrix*para_mask, axis=1) #batch end_test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, 
end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para,end_norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len) end_mask_test_return=T.argmax(end_test_span_scores_matrix*para_mask, axis=1) #batch params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]+[end_U_a,end_HL_1_para,end_HL_2_para,end_HL_3_para,end_HL_4_para] L2_reg =L2norm_paraList([embeddings,char_embeddings,conv_W_1,conv_W_2,conv_W_1_q, conv_W_2_q, conv_W_char,U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]) #L2_reg = L2norm_paraList(params) cost=loss#+L2_weight*L2_reg accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask, true_p_len], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len], [mask_test_return,end_mask_test_return], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_ids[para_id:para_id+batch_size] cost_i+= train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict={} q_amount=0 p1=0 for test_para_id in test_batch_start: batch_predict_ids, batch_predict_end_ids=test_model( test_para_list[test_para_id:test_para_id+test_batch_size], test_Q_list[test_para_id:test_para_id+test_batch_size], test_para_mask[test_para_id:test_para_id+test_batch_size], test_Q_mask[test_para_id:test_para_id+test_batch_size], test_para_char_list[test_para_id:test_para_id+test_batch_size], test_Q_char_list[test_para_id:test_para_id+test_batch_size], test_para_char_mask[test_para_id:test_para_id+test_batch_size], test_Q_char_mask[test_para_id:test_para_id+test_batch_size], test_p_len_limit) test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size] # test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] # q_amount+=test_batch_size q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] q_amount+=test_batch_size for q in range(test_batch_size): #for each question # pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q]) start = batch_predict_ids[q] end = batch_predict_end_ids[q] if end < start: start, end = end, start pred_ans = ' '.join(test_para_wordlist_batch[q][start:end+1]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 
'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50], batch_size=1, window_width=4, maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40): maxSentLength=max_truncate+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/' mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt') #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] 
testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int64') indices_train_r=T.cast(indices_train_r, 'int64') indices_test_l=T.cast(indices_test_l, 'int64') indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.lmatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.lmatrix('x_index_r') y = T.lvector('y') left_l=T.lscalar() right_l=T.lscalar() left_r=T.lscalar() right_r=T.lscalar() length_l=T.lscalar() length_r=T.lscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() wmf=T.dmatrix() 
cost_tmp=T.dscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) 
sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) aver_uni_l=sum_uni_l/layer0_l_input.shape[3] norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) aver_uni_r=sum_uni_r/layer0_r_input.shape[3] norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) aver_uni_cosine=cosine(aver_uni_l, aver_uni_r) uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi') ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r)) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer3_input=T.concatenate([#mts, uni_cosine,#eucli_1_exp,#uni_sigmoid_simi, #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # layer1.output_cosine, #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # len_l, len_r,wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer3=LogisticRegression(rng, input=layer3_input, n_in=(1)+(1)+2+2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg 
cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size], wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params params_conv = [conv_W, conv_b] accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index,cost_tmp], cost, updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + 
batch_size], wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y], givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size], wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False svm_max=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data cost_tmp=0.0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) #print batch_start if iter%update_freq != 0: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) #print 'layer3_input', layer3_input cost_tmp+=cost_ij error_sum+=error_ij #print 'cost_acc ',cost_acc #print 'cost_ij ', cost_ij #print 'cost_tmp before update',cost_tmp else: cost_average= train_model(batch_start,cost_tmp) #print 'layer3_input', layer3_input error_sum=0 cost_tmp=0.0#reset for the next batch #print 'cost_average ', cost_average #print 'cost_this ',cost_this #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_probs=[] test_y=[] test_features=[] for i in test_batch_start: prob_i, layer3_input, y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_probs.append(prob_i[0][0]) test_y.append(y[0]) test_features.append(layer3_input[0]) MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs) #now, check MAP and MRR print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best ' 'model %f, MRR %f') % (epoch, minibatch_index, n_train_batches,MAP, MRR)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') train_y=[] train_features=[] count=0 for batch_start in 
train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n') #count+=1 #write_feature.close() clf = svm.SVC(C=1.0, kernel='linear') clf.fit(train_features, train_y) results_svm=clf.decision_function(test_features) MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr) print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr if patience <= iter: done_looping = True break #after each epoch, increase the batch_size if epoch%2==1: update_freq=update_freq*1 else: update_freq=update_freq/1 #store the paras after epoch 15 if epoch ==15: store_model_to_file(params_conv) print 'Finished storing best conv params' exit(0) #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def _split_sentence_pairs(dataset):
    """Unpack one corpus split and separate the two sentences of each pair.

    Even rows are assigned to the left ('l') side and odd rows to the right
    ('r') side.  Returns ``(labels, left, right)``; each side is a dict with
    the keys 'indices', 'lengths', 'norm_lengths', 'left_pad', 'right_pad'.
    The labels are returned unsliced.
    """
    indices, labels, lengths, norm_lengths, left_pad, right_pad = dataset
    left = {
        'indices': indices[::2, :],
        'lengths': lengths[::2],
        'norm_lengths': norm_lengths[::2],
        'left_pad': left_pad[::2],
        'right_pad': right_pad[::2],
    }
    right = {
        'indices': indices[1::2, :],
        'lengths': lengths[1::2],
        'norm_lengths': norm_lengths[1::2],
        'left_pad': left_pad[1::2],
        'right_pad': right_pad[1::2],
    }
    return labels, left, right


def _shared_int64(index_matrix):
    """Store an index matrix as a floatX shared variable and cast it to int64."""
    stored = theano.shared(numpy.asarray(index_matrix, dtype=theano.config.floatX), borrow=True)
    return T.cast(stored, 'int64')


def _init_embeddings(vocab_size, emb_size, word2vec_path):
    """Build the shared embedding table: random rows, a zero row 0, then word2vec."""
    values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX,
                                 numpy.random.RandomState(1234))
    values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    values = load_word2vec_to_init(values, word2vec_path)
    return theano.shared(value=values, borrow=True)


def _unigram_similarities(input_l, input_r, emb_size):
    """Return (1/(1+EUCLID), cosine) of the summed word embeddings of both sides."""
    # The reshape to (1, emb_size) only works for a batch of one sentence pair.
    sum_l = T.sum(input_l, axis=3).reshape((1, emb_size))
    sum_r = T.sum(input_r, axis=3).reshape((1, emb_size))
    return 1.0 / (1.0 + EUCLID(sum_l, sum_r)), cosine(sum_l, sum_r)


def _class_counts(predicted, gold, label):
    """Return (correct, total) over the examples whose gold label equals `label`."""
    correct = 0
    total = 0
    for guess, truth in zip(predicted, gold):
        if truth == label:
            total += 1
            if guess == truth:
                correct += 1
    return correct, total


def _safe_ratio(numerator, denominator):
    """Return numerator/denominator as a float, or 0.0 when the denominator is 0."""
    if denominator == 0:
        return 0.0
    return numerator * 1.0 / denominator


def _write_feature_rows(path, rows):
    """Write one space-separated feature vector per line to `path`."""
    with open(path, 'w') as handle:
        for row in rows:
            handle.write(' '.join(map(str, row)) + '\n')


def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50], batch_size=1, window_width=3,
                    maxSentLength=64, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00065,
                    Div_reg=0.01, update_freq=1, norm_threshold=5.0, max_truncate=33,
                    max_truncate_nonoverlap=24):
    """Train and evaluate the sentence-pair CNN on the SICK entailment data.

    Two views of every sentence pair are loaded (the full sentences and the
    "nonoverlap" files); both go through the same convolution filters and an
    average-pooling layer, and the resulting similarity features are
    concatenated with the MT features, the rule-based extra features and the
    normalized sentence lengths and fed to a 3-way logistic regression
    trained with AdaGrad.  Once per epoch the concatenated feature vectors
    are additionally used to fit a linear SVM and a scikit-learn logistic
    regression; whenever the best accuracy improves, the train and test
    feature vectors are written to files under the data directory.

    Parameters:
        learning_rate: AdaGrad step size.
        n_epochs: maximum number of passes over the training batches.
        nkerns: only ``nkerns[0]`` is read, as the number of filters.
        batch_size: rows per batch.  The unigram features are reshaped to
            (1, emb_size) and only the first row of each batch is collected
            for the SVM/LR features, so only batch_size=1 is handled.
        window_width: filter width.
        maxSentLength: ignored; recomputed from max_truncate/window_width.
        emb_size: embedding dimensionality.
        hidden_size, margin, norm_threshold: not used by this function.
        L2_weight, Div_reg: weights of the L2 and diversity regularizers.
        update_freq: a parameter update is applied on every iteration that
            is a multiple of this value; the costs of the iterations in
            between are accumulated and passed to that update.
        max_truncate, max_truncate_nonoverlap: truncation limits handed to
            load_SICK_corpus for the two views.

    Returns None.  Progress is printed to stdout, the run time to stderr.
    """
    # The padded lengths are always derived from the truncation limits, which
    # is why the ``maxSentLength`` argument is overwritten.
    maxSentLength = max_truncate + 2 * (window_width - 1)
    maxSentLength_nonoverlap = max_truncate_nonoverlap + 2 * (window_width - 1)
    model_options = locals().copy()
    # Every print below receives one pre-formatted string so that the block
    # parses both as a Python 2 print statement and as a Python 3 call.
    print('model options ' + str(model_options))

    rootPath = '/mounts/data/proj/wenpeng/Dataset/SICK/'
    rng = numpy.random.RandomState(23455)

    # ------------------------------------------------------------------ data
    datasets, vocab_size = load_SICK_corpus(
        rootPath + 'vocab.txt', rootPath + 'train_plus_dev.txt', rootPath + 'test.txt',
        max_truncate, maxSentLength, entailment=True)
    datasets_nonoverlap, vocab_size_nonoverlap = load_SICK_corpus(
        rootPath + 'vocab_nonoverlap_train_plus_dev.txt',
        rootPath + 'train_plus_dev_removed_overlap_as_training.txt',
        rootPath + 'test_removed_overlap_as_training.txt',
        max_truncate_nonoverlap, maxSentLength_nonoverlap, entailment=True)
    mt_train, mt_test = load_mts_wikiQA(rootPath + 'Train_plus_dev_MT/concate_14mt_train.txt',
                                        rootPath + 'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test = load_extra_features(
        rootPath + 'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt',
        rootPath + 'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    discri_train, discri_test = load_extra_features(rootPath + 'train_plus_dev_discri_features_0.3.txt',
                                                    rootPath + 'test_discri_features_0.3.txt')

    trainY, train_l, train_r = _split_sentence_pairs(datasets[0])
    testY, test_l, test_r = _split_sentence_pairs(datasets[1])
    # Labels of the nonoverlap view are not used; the main view provides them.
    _, train_l_no, train_r_no = _split_sentence_pairs(datasets_nonoverlap[0])
    _, test_l_no, test_r_no = _split_sentence_pairs(datasets_nonoverlap[1])

    # Batch bookkeeping must read the row count before the index matrices are
    # replaced by symbolic expressions below.
    n_train_batches = train_l['indices'].shape[0] // batch_size
    n_test_batches = test_l['indices'].shape[0] // batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    for side in (train_l, train_r, test_l, test_r,
                 train_l_no, train_r_no, test_l_no, test_r_no):
        side['indices'] = _shared_int64(side['indices'])

    embeddings = _init_embeddings(vocab_size, emb_size,
                                  rootPath + 'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings_nonoverlap = _init_embeddings(
        vocab_size_nonoverlap, emb_size,
        rootPath + 'vocab_nonoverlap_train_plus_dev_in_word2vec_embs_300d.txt')

    train_data = {'labels': trainY, 'mts': mt_train, 'extra': extra_train, 'discri': discri_train,
                  'l': train_l, 'r': train_r, 'l_nonoverlap': train_l_no, 'r_nonoverlap': train_r_no}
    test_data = {'labels': testY, 'mts': mt_test, 'extra': extra_test, 'discri': discri_test,
                 'l': test_l, 'r': test_r, 'l_nonoverlap': test_l_no, 'r_nonoverlap': test_r_no}

    # ------------------------------------------------------- symbolic inputs
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')
    x_index_l_nonoverlap = T.lmatrix('x_index_l_nonoverlap')
    x_index_r = T.lmatrix('x_index_r')
    x_index_r_nonoverlap = T.lmatrix('x_index_r_nonoverlap')
    y = T.lvector('y')
    left_l, right_l, left_r, right_r = T.lscalar(), T.lscalar(), T.lscalar(), T.lscalar()
    length_l, length_r = T.lscalar(), T.lscalar()
    norm_length_l, norm_length_r = T.dscalar(), T.dscalar()
    left_l_nonoverlap, right_l_nonoverlap = T.lscalar(), T.lscalar()
    left_r_nonoverlap, right_r_nonoverlap = T.lscalar(), T.lscalar()
    length_l_nonoverlap, length_r_nonoverlap = T.lscalar(), T.lscalar()
    norm_length_l_nonoverlap, norm_length_r_nonoverlap = T.dscalar(), T.dscalar()
    mts = T.dmatrix()
    extra = T.dmatrix()
    discri = T.dmatrix()  # fed through givens but not part of layer3_input
    cost_tmp = T.dscalar()  # cost accumulated over the batches since the last update

    # Per sentence side: (index matrix, left pad, right pad, length, normalized length).
    side_inputs = {
        'l': (x_index_l, left_l, right_l, length_l, norm_length_l),
        'r': (x_index_r, left_r, right_r, length_r, norm_length_r),
        'l_nonoverlap': (x_index_l_nonoverlap, left_l_nonoverlap, right_l_nonoverlap,
                         length_l_nonoverlap, norm_length_l_nonoverlap),
        'r_nonoverlap': (x_index_r_nonoverlap, left_r_nonoverlap, right_r_nonoverlap,
                         length_r_nonoverlap, norm_length_r_nonoverlap),
    }

    def givens_for(data):
        """Bind every symbolic input to the rows of `data` selected by `index`."""
        mapping = {
            y: data['labels'][index: index + batch_size],
            mts: data['mts'][index: index + batch_size],
            extra: data['extra'][index: index + batch_size],
            discri: data['discri'][index: index + batch_size],
        }
        for key, (x_index, left, right, length, norm_length) in side_inputs.items():
            side = data[key]
            mapping[x_index] = side['indices'][index: index + batch_size]
            mapping[left] = side['left_pad'][index]
            mapping[right] = side['right_pad'][index]
            mapping[length] = side['lengths'][index]
            mapping[norm_length] = side['norm_lengths'][index]
        return mapping

    ishape = (emb_size, maxSentLength)
    ishape_nonoverlap = (emb_size, maxSentLength_nonoverlap)
    filter_size = (emb_size, window_width)
    filter_shape = (nkerns[0], 1, filter_size[0], filter_size[1])

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    def embed(table, x_index, sent_length):
        """Look up embeddings and lay them out as (batch, 1, emb_size, sent_length)."""
        return table[x_index.flatten()].reshape(
            (batch_size, sent_length, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    layer0_l_input = embed(embeddings, x_index_l, maxSentLength)
    layer0_r_input = embed(embeddings, x_index_r, maxSentLength)
    layer0_l_input_nonoverlap = embed(embeddings_nonoverlap, x_index_l_nonoverlap,
                                      maxSentLength_nonoverlap)
    layer0_r_input_nonoverlap = embed(embeddings_nonoverlap, x_index_r_nonoverlap,
                                      maxSentLength_nonoverlap)

    # One set of filters is shared by both sentences and both views.
    conv_W, conv_b = create_conv_para(rng, filter_shape=filter_shape)
    conv_W_into_matrix = conv_W.reshape((conv_W.shape[0], conv_W.shape[2] * conv_W.shape[3]))

    def convolve(layer_input, shape):
        """Apply the shared convolution filters to one sentence tensor."""
        return Conv_with_input_para(rng, input=layer_input,
                                    image_shape=(batch_size, 1, shape[0], shape[1]),
                                    filter_shape=filter_shape, W=conv_W, b=conv_b)

    layer0_l = convolve(layer0_l_input, ishape)
    layer0_r = convolve(layer0_r_input, ishape)
    layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output')
    layer0_l_nonoverlap = convolve(layer0_l_input_nonoverlap, ishape_nonoverlap)
    layer0_r_nonoverlap = convolve(layer0_r_input_nonoverlap, ishape_nonoverlap)
    layer0_l_output_nonoverlap = debug_print(layer0_l_nonoverlap.output, 'layer0_l_nonoverlap.output')
    layer0_r_output_nonoverlap = debug_print(layer0_r_nonoverlap.output, 'layer0_r_nonoverlap.output')

    layer1 = Average_Pooling_for_Top(
        rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
        left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r,
        length_l=length_l + filter_size[1] - 1, length_r=length_r + filter_size[1] - 1,
        dim=maxSentLength + filter_size[1] - 1)
    layer1_nonoverlap = Average_Pooling_for_Top(
        rng, input_l=layer0_l_output_nonoverlap, input_r=layer0_r_output_nonoverlap, kern=nkerns[0],
        left_l=left_l_nonoverlap, right_l=right_l_nonoverlap,
        left_r=left_r_nonoverlap, right_r=right_r_nonoverlap,
        length_l=length_l_nonoverlap + filter_size[1] - 1,
        length_r=length_r_nonoverlap + filter_size[1] - 1,
        dim=maxSentLength_nonoverlap + filter_size[1] - 1)

    eucli_1, uni_cosine = _unigram_similarities(layer0_l_input, layer0_r_input, emb_size)
    eucli_1_nonoverlap, uni_cosine_nonoverlap = _unigram_similarities(
        layer0_l_input_nonoverlap, layer0_r_input_nonoverlap, emb_size)

    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    len_l_nonoverlap = norm_length_l_nonoverlap.reshape((1, 1))
    len_r_nonoverlap = norm_length_r_nonoverlap.reshape((1, 1))

    layer3_input = T.concatenate([
        mts,
        eucli_1, uni_cosine,
        eucli_1_nonoverlap, uni_cosine_nonoverlap,
        layer1.output_eucli_to_simi, layer1.output_cosine,
        layer1_nonoverlap.output_eucli_to_simi, layer1_nonoverlap.output_cosine,
        len_l, len_r, len_l_nonoverlap, len_r_nonoverlap,
        extra,
    ], axis=1)
    # 35 input columns in total.
    # NOTE(review): presumably 14 MT columns, 12 similarity/length columns and
    # 9 extra-feature columns -- confirm against the feature files.
    layer3 = LogisticRegression(rng, input=layer3_input, n_in=14+(2*2)+(2*2)+(2*2)+9, n_out=3)

    L2_reg = debug_print((layer3.W ** 2).sum() + (conv_W ** 2).sum(), 'L2_reg')
    diversify_reg = Diversify_Reg(layer3.W.T) + Diversify_Reg(conv_W_into_matrix)
    cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this')
    cost = debug_print((cost_this + cost_tmp) / update_freq + L2_weight * L2_reg + Div_reg * diversify_reg,
                       'cost')

    test_model = theano.function([index], [layer3.errors(y), layer3_input, y],
                                 givens=givens_for(test_data), on_unused_input='ignore')

    params = layer3.params + [conv_W, conv_b]
    accumulator = [
        theano.shared(numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX),
                      borrow=True)
        for para_i in params
    ]
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        # AdaGrad step.
        # NOTE(review): no epsilon is added under the square root, so a gradient
        # entry that is exactly zero on the first update divides 0 by 0 --
        # confirm whether that can occur for these parameters.
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))
        updates.append((acc_i, acc))

    train_model = theano.function([index, cost_tmp], cost, updates=updates,
                                  givens=givens_for(train_data), on_unused_input='ignore')
    train_model_predict = theano.function([index], [cost_this, layer3.errors(y), layer3_input, y],
                                          givens=givens_for(train_data), on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    patience = 500000000000000  # effectively disables the iteration cap below
    # Evaluate once per epoch (the patience term never wins with the value above).
    validation_frequency = min(n_train_batches, patience // 2)
    test_score = 0.

    start_time = time.clock()
    mid_time = start_time
    epoch = 0
    done_looping = False
    max_acc = 0.0
    pre_max = -1
    best_epoch = 0
    # error_sum only counts batches scored by train_model_predict and is
    # cleared on every update, so it reads 0 in the progress line whenever an
    # update happened on that iteration.
    error_sum = 0
    # Defined up front so the progress line cannot hit an unbound name when no
    # update has been applied yet.
    cost_average = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        shuffle(train_batch_start)
        pending_cost = 0.0
        for minibatch_index, batch_start in enumerate(train_batch_start, 1):
            # Number of batches processed so far, counted across epochs.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % update_freq != 0:
                cost_ij, error_ij, _, _ = train_model_predict(batch_start)
                pending_cost += cost_ij
                error_sum += error_ij
            else:
                cost_average = train_model(batch_start, pending_cost)
                error_sum = 0
                pending_cost = 0.0

            if iteration % n_train_batches == 0:
                print('training @ iter = '+str(iteration)+' average cost: '+str(cost_average)
                      +' error: '+str(error_sum)+'/'+str(update_freq)
                      +' error rate: '+str(error_sum*1.0/update_freq))

            if iteration % validation_frequency == 0:
                test_losses = []
                test_y = []
                test_features = []
                for test_start in test_batch_start:
                    test_loss, batch_features, batch_labels = test_model(test_start)
                    test_losses.append(test_loss)
                    test_y.append(batch_labels[0])
                    test_features.append(batch_features[0])
                test_score = numpy.mean(test_losses)
                test_acc = 1 - test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                       'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.))

                # Re-extract the training features with the current weights and
                # fit two external classifiers on them.
                train_y = []
                train_features = []
                for train_start in train_batch_start:
                    _, _, batch_features, batch_labels = train_model_predict(train_start)
                    train_y.append(batch_labels[0])
                    train_features.append(batch_features[0])

                clf = svm.SVC(kernel='linear')
                clf.fit(train_features, train_y)
                results = clf.predict(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.predict(test_features)

                test_size = len(test_y)
                corr_lr = sum(1 for guess, truth in zip(results_lr, test_y) if guess == truth)
                corr_neu, neu_co = _class_counts(results, test_y, 0)      # NEUTRAL
                corr_ent, ent_co = _class_counts(results, test_y, 1)      # ENTAILMENT
                corr_contr, contr_co = _class_counts(results, test_y, 2)  # CONTRADICTION

                # _safe_ratio keeps a label that is absent from the test split
                # (or an empty split) from raising ZeroDivisionError.
                acc = _safe_ratio(corr_neu + corr_ent + corr_contr, test_size)
                acc_neu = _safe_ratio(corr_neu, neu_co)
                acc_ent = _safe_ratio(corr_ent, ent_co)
                acc_contr = _safe_ratio(corr_contr, contr_co)
                acc_lr = _safe_ratio(corr_lr, test_size)

                # max_acc is the best of SVM, network and LR accuracy seen so far.
                for candidate in (acc, test_acc, acc_lr):
                    if candidate > max_acc:
                        max_acc = candidate
                        best_epoch = epoch

                print(' '.join(str(item) for item in (
                    '\t\t\tsvm:', acc, 'lr:', acc_lr, 'max:', max_acc, '(at', best_epoch, ')',
                    'Neu:', acc_neu, 'Ent:', acc_ent, 'Contr:', acc_contr)))

                if max_acc > pre_max:
                    _write_feature_rows(rootPath + 'train_feature_' + str(max_acc) + '.txt',
                                        train_features)
                    _write_feature_rows(rootPath + 'test_feature_' + str(max_acc) + '.txt',
                                        test_features)
                    print('features stored over')
                    pre_max = max_acc

            if patience <= iteration:
                done_looping = True
                break

        print(' '.join(str(item) for item in (
            'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min')))
        mid_time = time.clock()

    end_time = time.clock()
    print('Optimization complete.')
    # Report the values that are actually tracked during training.
    print('Best accuracy of %f %% obtained at epoch %i, '
          'with final test error %f %%' % (max_acc * 100., best_epoch, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=7, maxSentLength=60, emb_size=300, hidden_size=200, margin=0.5, L2_weight=0.00005, update_freq=10, norm_threshold=5.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2,:] indices_train_r=indices_train[1::2,:] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2,:] indices_test_r=indices_test[1::2,:] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] n_train_batches=indices_train_l.shape[0]/batch_size n_test_batches=indices_test_l.shape[0]/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) 
indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) indices_train_l=T.cast(indices_train_l, 'int32') indices_train_r=T.cast(indices_train_r, 'int32') indices_test_l=T.cast(indices_test_l, 'int32') indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size)) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() x_index_l = T.imatrix('x_index_l') # now, x is the index matrix, must be integer x_index_r = T.imatrix('x_index_r') y = T.ivector('y') left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.dscalar() norm_length_r=T.dscalar() mts=T.dmatrix() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_para=[conv_W, conv_b] layer1=Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1, window_size=window_width, maxSentLength=maxSentLength) conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1])) layer2_l = Conv_with_input_para(rng, input=layer1.output_tensor_l, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) layer2_r = Conv_with_input_para(rng, input=layer1.output_tensor_r, image_shape=(batch_size, 1, nkerns[0], ishape[1]), filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b) 
layer2_para=[conv2_W, conv2_b] layer3=Average_Pooling_for_batch1(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) layer3_out=debug_print(layer3.output_simi, 'layer1_out') #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh) sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size)) #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size)) #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) ''' uni_cosine=cosine(sum_uni_l, sum_uni_r) linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts layer4_input=T.concatenate([mts, eucli_1,layer1.output_eucli, layer3_out,len_l, len_r], axis=1)#, layer2.output, layer1.output_cosine], axis=1) #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1) #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2) layer4=LogisticRegression(rng, input=layer4_input, n_in=15+3+2, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((layer4.W** 2).sum()+(conv2_W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() cost_this =debug_print(layer4.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost') test_model = theano.function([index], 
[layer4.errors(y), layer4.y_pred], givens={ x_index_l: indices_test_l[index: index + batch_size], x_index_r: indices_test_r[index: index + batch_size], y: testY[index: index + batch_size], left_l: testLeftPad_l[index], right_l: testRightPad_l[index], left_r: testLeftPad_r[index], right_r: testRightPad_r[index], length_l: testLengths_l[index], length_r: testLengths_r[index], norm_length_l: normalized_test_length_l[index], norm_length_r: normalized_test_length_r[index], mts: mt_test[index: index + batch_size]}, on_unused_input='ignore') #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] params = layer4.params+ layer2_para+ layer0_para# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): grad_i=debug_print(grad_i,'grad_i') #norm=T.sqrt((grad_i**2).sum()) #if T.lt(norm_threshold, norm): # print 'big norm' # grad_i=grad_i*(norm_threshold/norm) acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([index], [cost,layer4.errors(y), layer4_input], updates=updates, givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size]}, on_unused_input='ignore') train_model_predict = theano.function([index], [cost_this,layer4.errors(y)], 
givens={ x_index_l: indices_train_l[index: index + batch_size], x_index_r: indices_train_r[index: index + batch_size], y: trainY[index: index + batch_size], left_l: trainLeftPad_l[index], right_l: trainRightPad_l[index], left_r: trainLeftPad_r[index], right_r: trainRightPad_r[index], length_l: trainLengths_l[index], length_r: trainLengths_r[index], norm_length_l: normalized_train_length_l[index], norm_length_r: normalized_train_length_r[index], mts: mt_train[index: index + batch_size]}, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 #shuffle(train_batch_start)#shuffle training data for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 minibatch_index=minibatch_index+1 #if epoch %2 ==0: # batch_start=batch_start+remain_train #time.sleep(0.5) if iter%update_freq != 0: cost_ij, error_ij=train_model_predict(batch_start) #print 'cost_ij: ', cost_ij cost_tmp+=cost_ij error_sum+=error_ij else: cost_average, error_ij, layer3_input= train_model(batch_start) #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq) error_sum=0 cost_tmp=0#reset for the next batch #print layer3_input #exit(0) #exit(0) if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq) #if iter ==1: # exit(0) if iter % validation_frequency == 0: #write_file=open('log.txt', 'w') test_losses=[] for i in test_batch_start: test_loss, pred_y=test_model(i) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' #print 'validating & testing...' 
# compute zero-one loss on validation set validation_losses = [] for i in dev_batch_start: time.sleep(0.5) validation_losses.append(validate_model(i)) #validation_losses = [validate_model(i) for i in dev_batch_start] this_validation_loss = numpy.mean(validation_losses) print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \ (epoch, minibatch_index , n_train_batches, \ this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [test_model(i) for i in test_batch_start] test_score = numpy.mean(test_losses) print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_score * 100.)) ''' if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A1_input = embeddings[index_A1.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1])) layer0_para=[conv_W, conv_b] conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) layer2_para=[conv2_W, conv2_b] high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # this part decides nkern[0] and nkern[1] must be in the same dimension highW_para=[high_W, high_b] params = layer2_para+layer0_para+highW_para#+[embeddings] layer0_D = Conv_with_input_para(rng, input=layer0_D_input, image_shape=(maxDocLength, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b) layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input, image_shape=(batch_size, 1, ishape[0], ishape[1]),
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=300, batch_size=1, window_width=[3,3],
                    emb_size=300, margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1,
                    norm_threshold=5.0, max_truncate=40, max_relation_len=6, max_Q_len=30,
                    neg_all=100, train_size=69967, test_size=19953, mark='_RC_newdata'):
    """Train and evaluate a convolutional question/relation matching model.

    For every example the question is scored (cosine similarity of pooled
    convolution features) against a list of candidate relations.  The first
    candidate is treated as the positive one; all others are negatives and
    contribute a hinge loss ``max(0, margin - positive + negative)``.
    Parameters are updated with AdaGrad after every training example, and the
    test set is evaluated each time ``iteration % n_train_batches == 0``.
    Whenever the test accuracy improves, the parameters are written out via
    ``store_model_to_file(rootPath, params, mark)``.

    Args:
        learning_rate: AdaGrad base step size.
        n_epochs: maximum number of passes over the training examples.
        word_nkerns: number of convolution filters.
        batch_size: number of questions per step; used in the reshape of the
            embedded inputs and to derive ``n_train_batches``.
        window_width: only ``window_width[0]`` is read (filter width).
        emb_size: word-embedding dimensionality.
        margin: hinge-loss margin.
        L2_weight: weight of the L2 penalty on embeddings and filters.
        Div_reg: weight of the ``Diversify_Reg`` penalty on the filters.
        max_relation_len: padded length of a relation word-id sequence.
        max_Q_len: padded length of a question word-id sequence.
        train_size: expected number of training examples.
        test_size: expected number of test examples.
        mark: suffix forwarded to ``load_train`` and ``store_model_to_file``.
        update_freq, norm_threshold, max_truncate, neg_all: accepted for
            interface compatibility; not read by this function (they only
            appear in the printed ``model options``).

    Returns:
        None.  Progress and results are printed; the process exits with
        status 1 if the loaded data does not have the expected size.
    """
    model_options = locals().copy()
    print('model options %s' % (model_options,))

    rootPath = '/home/wyin/Datasets/SimpleQuestions_v2/relation_classification/'
    triple_files = ['train.replace_ne.withpoolwenpengFormat.txt',
                    'test.replace_ne.withpoolwenpengFormat.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_train, length_per_example_test, vocab_size = load_train(
        triple_files[0], triple_files[1], max_relation_len, max_Q_len, train_size, test_size, mark)
    print('vocab_size: %s' % (vocab_size,))

    train_data = datasets
    test_data = datasets_test

    train_relations = train_data[0]
    train_relation_lengths = train_data[1]
    train_remainQ_word_ids = train_data[2]
    train_remainQ_word_len = train_data[3]

    test_relations = test_data[0]
    test_relation_lengths = test_data[1]
    test_remainQ_word_ids = test_data[2]
    test_remainQ_word_len = test_data[3]

    # Every per-example list must have exactly the expected length.  Each
    # list is checked individually: comparing only the mean of the lengths
    # would let mismatches that average out slip through.
    train_sizes = [len(train_relations), len(train_relation_lengths),
                   len(train_remainQ_word_ids), len(train_remainQ_word_len)]
    if any(size != train_size for size in train_sizes):
        print('weird size: %s' % (train_sizes,))
        sys.exit(1)  # non-zero status: this is an error, not a normal exit

    test_sizes = [len(test_relations), len(test_relation_lengths),
                  len(test_remainQ_word_ids), len(test_remainQ_word_len)]
    if any(size != test_size for size in test_sizes):
        print('weird size: %s' % (test_sizes,))
        sys.exit(1)

    n_train_batches = train_size // batch_size

    # Embedding table with one extra row; row 0 is forced to all zeros here
    # and kept at zero by the embedding update rule below.
    rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    # Optional (disabled): initialise from pre-trained vectors.
    # rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    # Symbolic inputs: one row per candidate relation, one vector for the question.
    rel_word_ids_M = T.imatrix()
    rel_word_lens_M = T.imatrix()
    q_word_ids_f = T.ivector()
    q_word_lens_f = T.ivector()

    filter_size = (emb_size, window_width[0])

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    q_rel_conv_W, q_rel_conv_b = create_conv_para(rng, filter_shape=word_filter_shape)
    params = [embeddings, q_rel_conv_W, q_rel_conv_b]
    # One row per filter, used by the diversity regulariser.
    q_rel_conv_W_into_matrix = q_rel_conv_W.reshape(
        (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3]))
    # Optional (disabled): resume from previously stored parameters.
    # load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(rel_word_ids_f, rel_word_lens_f):
        """Scan step: similarity between the question and one candidate relation.

        The question inputs (``q_word_ids_f``, ``q_word_lens_f``) are taken
        from the enclosing scope; question and relation share the same
        convolution filters.
        """
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
            (batch_size, max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        # q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                                          image_shape=(batch_size, 1, emb_size, max_Q_len),
                                          filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                                        image_shape=(batch_size, 1, emb_size, max_relation_len),
                                        filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)

        # NOTE(review): entries 0 / 1 / 2 of the length triples are passed as
        # left_l / length / right_l -- presumably left pad, true length and
        # right pad; confirm against load_train.
        rel_conv_pool = Max_Pooling(rng, input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output,
                                                  input_r=rel_conv_pool.output_maxpooling,
                                                  left_l=q_word_lens_f[0], right_l=q_word_lens_f[2],
                                                  length_l=q_word_lens_f[1] + filter_size[1] - 1,
                                                  dim=max_Q_len + filter_size[1] - 1, topk=2)

        overall_simi = cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)
        return overall_simi

    # The updates returned by scan are not used; the training updates are
    # built from scratch below.
    simi_list, _scan_updates = theano.scan(
        SimpleQ_matches_Triple,
        sequences=[rel_word_ids_M, rel_word_lens_M])

    # Candidate 0 is the positive relation, the rest are negatives.
    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.sum(loss_simi_list)

    L2_reg = debug_print((embeddings ** 2).sum() + (q_rel_conv_W ** 2).sum(), 'L2_reg')
    diversify_reg = Diversify_Reg(q_rel_conv_W_into_matrix)
    cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg

    test_model = theano.function([rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f],
                                 [loss_simi, simi_list], on_unused_input='ignore')

    # AdaGrad: one accumulator of squared gradients per parameter.
    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        if param_i is embeddings:
            # Same AdaGrad step, but row 0 of the embedding table is reset
            # to zeros after every update.
            updates.append((param_i, T.set_subtensor(
                (param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                theano.shared(numpy.zeros(emb_size)))))
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10)))
        updates.append((acc_i, acc))

    train_model = theano.function([rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f],
                                  [loss_simi, cost], updates=updates, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # Effectively disables the patience-based stop below.
    patience = 500000000000000

    # NOTE(review): 20610 is presumably the size of the full test set.  The
    # (20610 - test_size) examples that are not loaded here are counted as
    # correct by the accuracy formula below -- confirm.
    total_test_examples = 20610

    best_test_accu = 0.0
    best_iter = 0

    # NOTE: time.clock() exists on Python 2 (which this script targets) but
    # was removed in Python 3.8.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_index = 0

        for jj in range(train_size):
            # number of training steps run so far, across epochs
            iteration = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            train_rel_word_ids_M = numpy.asarray(train_relations[jj], dtype='int32').reshape(
                (length_per_example_train[jj], max_relation_len))
            train_rel_word_lens_M = numpy.asarray(train_relation_lengths[jj], dtype='int32').reshape(
                (length_per_example_train[jj], 3))
            train_q_word_ids_M = numpy.asarray(train_remainQ_word_ids[jj], dtype='int32')
            train_q_word_lens_M = numpy.asarray(train_remainQ_word_len[jj], dtype='int32')
            loss_simi_i, cost_i = train_model(train_rel_word_ids_M, train_rel_word_lens_M,
                                              train_q_word_ids_M, train_q_word_lens_M)

            if iteration % n_train_batches == 0:
                print('training @ iter = %s\tloss_simi_i:  %s cost_i: %s'
                      % (iteration, loss_simi_i, cost_i))

                # Evaluate on the whole test set.
                succ = 0
                for i in range(test_size):
                    test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape(
                        (length_per_example_test[i], max_relation_len))
                    test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape(
                        (length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32')
                    test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32')
                    loss_simi_i, simi_list_i = test_model(test_rel_word_ids_M, test_rel_word_lens_M,
                                                          test_q_word_ids_M, test_q_word_lens_M)
                    # Correct when the positive candidate scores at least as
                    # high as every negative one.
                    if simi_list_i[0] >= max(simi_list_i[1:]):
                        succ += 1

                succ = (succ + total_test_examples - test_size) * 1.0 / total_test_examples
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                       'model %f') % (epoch, minibatch_index, n_train_batches, succ))

                if best_test_accu < succ:
                    best_test_accu = succ
                    best_iter = iteration
                    store_model_to_file(rootPath, params, mark)

            if patience <= iteration:
                done_looping = True
                break

        print('Epoch  %s uses  %s min' % (epoch, (time.clock() - mid_time) / 60.0))
        mid_time = time.clock()

    end_time = time.clock()
    print('Optimization complete.')
    # Report the metric this function actually tracks.
    print('Best test accu of %f obtained at iteration %i' % (best_test_accu, best_iter))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm\n' % ((end_time - start_time) / 60.))