def classifier(rng,common_input_l,common_input_r,sents_mask_l, sents_mask_r,drop_conv_W_2_pre,conv_b_2_pre,drop_conv_W_2_gate,conv_b_2_gate,drop_conv_W_2,conv_b_2,drop_conv_W_2_context, conv_b_2_context,labels):
    """Build the 3-way sentence-pair classifier on top of two gated convolutions.

    Both inputs go through ``Conv_with_Mask_with_Gate`` (same weights for the
    left and right side), the two gated outputs are fed to ``Conv_for_Pair``,
    and its two attentive max-pooled vectors are combined (left, right and
    their element-wise product), passed through two ReLU hidden layers and
    finally through a logistic-regression layer with 3 outputs.

    The function reads ``batch_size``, ``emb_size``, ``maxSentLen``,
    ``gate_filter_shape``, ``hidden_size`` and ``filter_size`` from the
    enclosing/module scope; they are not parameters.

    Returns a 4-tuple:
        (negative log-likelihood of ``labels``,
         trainable parameters created here: [U_a, LR_b] + both hidden layers,
         ``p_y_given_x`` of the logistic-regression layer,
         ``errors(labels)`` of the logistic-regression layer)
    """
    n_classes = 3
    gate_image_shape = (batch_size, 1, emb_size, maxSentLen)
    pair_image_shape = (batch_size, 1, hidden_size[0], maxSentLen)

    def gated_conv(sent_tensor, sent_mask):
        # One gated convolution; left and right share every weight.
        return Conv_with_Mask_with_Gate(
            rng,
            input_tensor3=sent_tensor,
            mask_matrix=sent_mask,
            image_shape=gate_image_shape,
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre,
            b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate,
            b_gate=conv_b_2_gate)

    gate_layer_l = gated_conv(common_input_l, sents_mask_l)
    gate_layer_r = gated_conv(common_input_r, sents_mask_r)
    # NOTE(review): per the original inline note, output_tensor3 is presumably
    # sigmoid_gate * conv_out + (1 - sigmoid_gate) * input -- confirm in the
    # layer implementation.
    gated_l = gate_layer_l.output_tensor3
    gated_r = gate_layer_r.output_tensor3

    pair_conv = Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input_l,
        origin_input_tensor3_r=common_input_r,
        input_tensor3=gated_l,
        input_tensor3_r=gated_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=pair_image_shape,
        image_shape_r=pair_image_shape,
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=drop_conv_W_2,
        b=conv_b_2,
        W_context=drop_conv_W_2_context,
        b_context=conv_b_2_context)
    pooled_l = pair_conv.attentive_maxpool_vec_l
    pooled_r = pair_conv.attentive_maxpool_vec_r

    # Pair features: left vector, right vector and their element-wise product.
    pair_features = T.concatenate([pooled_l, pooled_r, pooled_l * pooled_r], axis=1)
    pair_features_size = hidden_size[1] * 3

    hidden_1 = HiddenLayer(rng, input=pair_features, n_in=pair_features_size,
                           n_out=hidden_size[0], activation=T.nnet.relu)
    hidden_2 = HiddenLayer(rng, input=hidden_1.output, n_in=hidden_size[0],
                           n_out=hidden_size[0], activation=T.nnet.relu)

    # The output layer sees the raw pair features plus both hidden outputs.
    lr_input_size = pair_features_size + 2 * hidden_size[0]
    U_a = create_ensemble_para(rng, n_classes, lr_input_size)
    LR_b = theano.shared(value=np.zeros((n_classes,), dtype=theano.config.floatX),
                         name='LR_b', borrow=True)  # one bias per target class
    lr_input = T.tanh(T.concatenate(
        [pair_features, hidden_1.output, hidden_2.output], axis=1))
    output_layer = LogisticRegression(rng, input=lr_input, n_in=lr_input_size,
                                      n_out=n_classes, W=U_a, b=LR_b)

    loss = output_layer.negative_log_likelihood(labels)
    new_params = [U_a, LR_b] + hidden_1.params + hidden_2.params
    return loss, new_params, output_layer.p_y_given_x, output_layer.errors(labels)
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5], char_filter_size=4, margin=0.5, max_EM=50.302743615): test_batch_size=batch_size*10 model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) word2id={} char2id={} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len) train_size=len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size=len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) 
test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices= T.ivector() #batch, one gold word for each sample para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask=T.fmatrix() char_q_mask=T.fmatrix() # true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] true_p_len = paragraph.shape[1] common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size)) char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len)) char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len)) char_LSTM_para_dict=create_LSTM_para(rng, char_emb_size, char_emb_size) char_LSTM_para_dict_bw=create_LSTM_para(rng, char_emb_size, char_emb_size) char_lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_p.dimshuffle(0,2,1), char_p_masks, char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw) char_word_embeddings_p = char_lstm_layer_p.output_sent_rep_conc.reshape((true_batch_size, true_p_len, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden) char_lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_q.dimshuffle(0,2,1), char_q_masks, char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw) char_word_embeddings_q = char_lstm_layer_q.output_sent_rep_conc.reshape((true_batch_size, q_len_limit, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden) LSTM_para_dict=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size) #40+300 LSTM_para_dict_bw=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size) p_input2lstm = T.concatenate([common_input_p.dimshuffle(0,2,1), char_word_embeddings_p], axis=1) #(batch, emb_size+char_emb_size, p_len) q_input2lstm = T.concatenate([common_input_q.dimshuffle(0,2,1), char_word_embeddings_q], axis=1) 
#(batch, emb_size+char_emb_size, p_len) lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(p_input2lstm, para_mask, hidden_size, LSTM_para_dict,LSTM_para_dict_bw) p_tensor3 = lstm_layer_p.output_tensor_conc #(batch, 2*hidden, p_len) lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(q_input2lstm, q_mask, hidden_size, LSTM_para_dict,LSTM_para_dict_bw) q_reps = lstm_layer_q.output_sent_rep_conc #(batch, 2*hidden) NN_para=char_LSTM_para_dict.values()+char_LSTM_para_dict_bw.values()+LSTM_para_dict.values()+LSTM_para_dict_bw.values() input4score = T.concatenate([p_tensor3, T.repeat(q_reps.dimshuffle(0,1,'x'), true_p_len, axis=2)], axis=1) #(batch, 4*hidden, p_len) HL_1_para = create_ensemble_para(rng, hidden_size, 4*hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a=normalize_matrix(U_a) norm_HL_1_para=normalize_matrix(HL_1_para) norm_HL_2_para=normalize_matrix(HL_2_para) norm_HL_3_para=normalize_matrix(HL_3_para) norm_HL_4_para=normalize_matrix(HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, true_batch_size,true_p_len) span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len) loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(true_batch_size), gold_indices])) #ranking loss tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((true_batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor(index_matrix[T.arange(true_batch_size), gold_indices], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega 
= T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega)) loss = loss_neg_likelihood + loss_rank #test mask_test_return=T.argmax(span_scores_matrix*para_mask, axis=1) #batch params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask], mask_test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_ids[para_id:para_id+batch_size] cost_i+= train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch]) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() # pred_dict={} q_amount=0 p1=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( test_para_list[test_para_id:test_para_id+test_batch_size], test_Q_list[test_para_id:test_para_id+test_batch_size], test_para_mask[test_para_id:test_para_id+test_batch_size], test_Q_mask[test_para_id:test_para_id+test_batch_size], test_para_char_list[test_para_id:test_para_id+test_batch_size], test_Q_char_list[test_para_id:test_para_id+test_batch_size], test_para_char_mask[test_para_id:test_para_id+test_batch_size], test_Q_char_mask[test_para_id:test_para_id+test_batch_size]) # test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size] test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] q_amount+=test_batch_size for q in range(test_batch_size): #for each question predict_id = batch_predict_ids[q] ground_ids=test_label_batch[q] if predict_id in set(ground_ids): p1+=1 # print batch_predict_ids[q], mask_batch_predict_ids[q], test_p_len_limit - numpy.sum(test_para_mask[test_para_id+q]), scores_i[q], test_para_mask[test_para_id+q] exact_acc = p1*100.0/q_amount if exact_acc> max_exact_acc: max_exact_acc=exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print '\t\tcurrent exact:', exact_acc, '\t\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=10, test_batch_size=200, emb_size=300, hidden_size=100, L2_weight=0.0001, para_len_limit=300, q_len_limit=30, max_EM=40.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) # glove_vocab=set(word2vec.keys()) train_para_list, train_Q_list, train_start_list,train_end_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train_AI2(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_start_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test_AI2(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in overall_word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') # labels = T.imatrix('labels') #(batch, para_len) start_indices= T.ivector() #batch end_indices = T.ivector() #batch para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') 
extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' true_batch_size=paragraph.shape[0] norm_extraF=normalize_matrix(extraF) fwd_para=create_LSTM_para(rng, emb_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_para=create_LSTM_para(rng, emb_size, hidden_size) paragraph_para=fwd_para.values()+ bwd_para.values() fwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size) paragraph_para_e1=fwd_e1.values()+ bwd_e1.values() fwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size) paragraph_para_e11=fwd_e11.values()+ bwd_e11.values() fwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) bwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size) paragraph_para_e2=fwd_e2.values()+ bwd_e2.values() # U_e2, W_e2, b_e2=create_GRU_para(rng, hidden_size, hidden_size) # U_e2_b, W_e2_b, b_e2_b=create_GRU_para(rng, hidden_size, hidden_size) # paragraph_para_e2=[U_e2, W_e2, b_e2, U_e2_b, W_e2_b, b_e2_b] # fwd_Q=create_LSTM_para(rng, emb_size, hidden_size) #create_LSTM_para(rng, word_dim, hidden_dim) # bwd_Q=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_Q.values()+ bwd_Q.values() # W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) # W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a1 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features U_a2 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features U_a3 = create_ensemble_para(rng, 1, 6*hidden_size) # 3 extra features # LR_b = theano.shared(value=numpy.zeros((2,), # dtype=theano.config.floatX), # @UndefinedVariable # name='LR_b', borrow=True) HL_paras=[U_a1, 
U_a2, U_a3] params = [embeddings]+paragraph_para+paragraph_para_e1+paragraph_para_e11+HL_paras+paragraph_para_e2 # load_model_from_file(rootPath+'Best_Paras_AI2_31.210974456', params) paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) #self, X, Mask, hidden_dim, fwd_tparams, bwd_tparams paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_para, bwd_tparams= bwd_para) para_reps=paragraph_model.output_tensor #(batch, 2*hidden, para_len) Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, fwd_tparams=fwd_para, bwd_tparams= bwd_para) questions_reps_tensor=questions_model.output_tensor #(batch, 2*hidden ,q_len) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden) # questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #(batch, para_len, hidden) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), 
dtype=theano.config.floatX) # ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) norm_U_a3=normalize_matrix(U_a3) def example_in_batch(para_matrix, q_matrix): #assume both are (2*hidden, len) repeat_para_matrix_T=T.repeat(para_matrix.T, q_matrix.shape[1], axis=0) #(para_len*q_len, 2*hidden) repeat_q_matrix_3D = T.repeat(q_matrix.T.dimshuffle('x',0,1), para_matrix.shape[1], axis=0) #(para_len, q_len, 2*hidden) repeat_q_matrix_T= repeat_q_matrix_3D.reshape((repeat_q_matrix_3D.shape[0]*repeat_q_matrix_3D.shape[1], repeat_q_matrix_3D.shape[2])) #(para_len*q_len, 2*hidden) ele_mult =repeat_para_matrix_T*repeat_q_matrix_T #(#(para_len*q_len, 2*hidden)) overall_concv = T.concatenate([repeat_para_matrix_T, repeat_q_matrix_T, ele_mult], axis=1) ##(para_len*q_len, 6*hidden) scores=T.dot(overall_concv, norm_U_a3) #(para_len*q_len,1) interaction_matrix=scores.reshape((para_matrix.shape[1], q_matrix.shape[1])) #(para_len, q_len) # transpose_para_matrix=para_matrix.T # interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) # norm_interaction_matrix=T.maximum(0.0, interaction_matrix) q_by_para = T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(2*hidden, para_len) para_by_q = T.repeat(T.dot(para_matrix, T.nnet.softmax(T.max(interaction_matrix, axis=1).dimshuffle('x',0)).T), para_matrix.shape[1], axis=1) return (q_by_para, para_by_q) inter_return, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) batch_q_reps=inter_return[0] #(batch, 2*hidden, para_len) batch_para_reps=inter_return[1] #(batch, 2*hidden , para_len) #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), 
all are in (batch, hidden , para_len) ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps,para_reps*batch_q_reps, para_reps*batch_para_reps], axis=1) #(batch, 4*2*hidden, para_len) questions_reps.dimshuffle(0,2,1) para_ensemble_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e1, bwd_tparams= bwd_e1) para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, 2*hidden ,para_len) para_ensemble_model1=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e11, bwd_tparams= bwd_e11) para_reps_tensor4score1=para_ensemble_model1.output_tensor #(batch, 2*hidden ,para_len) Con_G_M=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score1], axis=1) #(batch, 10*hidden, para_len) #score for each para word norm_U_a=normalize_matrix(U_a1) start_scores=T.dot(Con_G_M.dimshuffle(0,2,1), norm_U_a) #(batch, para_len, 1) start_scores=T.nnet.softmax(start_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len) # para_reps_tensor4score = T.concatenate([para_reps_tensor4score, start_scores.dimshuffle(0,'x',1)], axis=1) para_ensemble_model2=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score1, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e2, bwd_tparams= bwd_e2) para_reps_tensor4score2=para_ensemble_model2.output_tensor #(batch, 2*hidden ,para_len) Con_G_M2=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score2], axis=1) #(batch, 10*hidden, para_len) norm_U_a2=normalize_matrix(U_a2) end_scores=T.dot(Con_G_M2.dimshuffle(0,2,1), norm_U_a2) #(batch, para_len, 1) end_scores=T.nnet.softmax(end_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len) #loss train loss=-T.mean(T.log(start_scores[T.arange(true_batch_size), start_indices])+T.log(end_scores[T.arange(true_batch_size), end_indices])) #test 
co_simi_batch_matrix=T.batched_dot((para_mask*start_scores).dimshuffle(0,1,'x'), (para_mask*end_scores).dimshuffle(0,'x',1)) #(batch, para_len, para_len) #reset lower dialgonal cols = numpy.concatenate([numpy.array(range(i), dtype=numpy.uint) for i in xrange(para_len_limit)]) rows = numpy.concatenate([numpy.array([i]*i, dtype=numpy.uint) for i in xrange(para_len_limit)]) c = T.set_subtensor(co_simi_batch_matrix[:,rows, cols], theano.shared(numpy.zeros(para_len_limit*(para_len_limit-1)/2))) #reset longer than 7 size cols2 = numpy.concatenate([numpy.array(range(i+7,para_len_limit), dtype=numpy.uint) for i in xrange(para_len_limit-7)]) rows2 = numpy.concatenate([numpy.array([i]*(para_len_limit-7-i), dtype=numpy.uint) for i in xrange(para_len_limit-7)]) c2 = T.set_subtensor(c[:,rows2, cols2], theano.shared(numpy.zeros((para_len_limit-7)*(para_len_limit-6)/2))) test_return=T.argmax(c2.reshape((true_batch_size, para_len_limit*para_len_limit)), axis=1) #batch #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,start_indices, end_indices,para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], test_return, 
on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_start_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_end_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX)) #print iter 
if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() # writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') # writefile.write('{') pred_dict={} # exact_match=0.0 # F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size] # para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size] q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] # print 'q_ids_batch:', q_ids_batch # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] # para_len=len(test_para_wordlist_list[0]) # if para_len!=len(distribution_matrix[0]): # print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) # exit(0) # q_size=len(distribution_matrix) q_amount+=test_batch_size # print q_size # print test_para_word_list # Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size] for q in range(test_batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in 
range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=decode_predict_id_AI2(batch_predict_ids[q], para_len_limit, test_para_wordlist_list[q]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans # writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ') # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) # q_gold_ans_set=para_gold_ansset_list[q] # # print test_para_wordlist_list[q] # # print Q_list_inword[q] # # print pred_ans.encode('utf8'), q_gold_ans_set # if pred_ans in q_gold_ans_set: # exact_match+=1 # F1=MacroF1(pred_ans, q_gold_ans_set) # F1_match+=F1 with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') # F1_acc=F1_match/q_amount # exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_AI2_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc # os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt') if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for 
file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=40, cand_size=10, hidden_size=[300, 300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'} seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train( sent_len, claim_len, cand_size) train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev( sent_len, claim_len, cand_size, word2id) test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo( sent_len, claim_len, cand_size, word2id) train_sents = np.asarray(train_sents, dtype='int32') train_3th_sents = np.asarray(train_3th_sents, dtype='int32') joint_train_sents = np.concatenate((train_sents, train_3th_sents)) test_sents = np.asarray(test_sents, dtype='int32') test_3th_sents = np.asarray(test_3th_sents, dtype='int32') joint_test_sents = np.concatenate((test_sents, test_3th_sents)) train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX) train_3th_sent_masks = np.asarray(train_3th_sent_masks, dtype=theano.config.floatX) joint_train_sent_masks = np.concatenate( (train_sent_masks, train_3th_sent_masks)) test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX) test_3th_sent_masks = np.asarray(test_3th_sent_masks, 
dtype=theano.config.floatX) joint_test_sent_masks = np.concatenate( (test_sent_masks, test_3th_sent_masks)) train_sent_labels = np.asarray(train_sent_labels, dtype='int32') train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32') joint_train_sent_labels = np.concatenate( (train_sent_labels, train_3th_sent_labels)) test_sent_labels = np.asarray(test_sent_labels, dtype='int32') test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32') joint_test_sent_labels = np.concatenate( (test_sent_labels, test_3th_sent_labels)) train_claims = np.asarray(train_claims, dtype='int32') train_3th_claims = np.asarray(train_3th_claims, dtype='int32') joint_train_claims = np.concatenate((train_claims, train_3th_claims)) test_claims = np.asarray(test_claims, dtype='int32') test_3th_claims = np.asarray(test_3th_claims, dtype='int32') joint_test_claims = np.concatenate((test_claims, test_3th_claims)) train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX) train_3th_claim_mask = np.asarray(train_3th_claim_mask, dtype=theano.config.floatX) joint_train_claim_mask = np.concatenate( (train_claim_mask, train_3th_claim_mask)) test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX) test_3th_claim_mask = np.asarray(test_3th_claim_mask, dtype=theano.config.floatX) joint_test_claim_mask = np.concatenate( (test_claim_mask, test_3th_claim_mask)) train_labels = np.asarray(train_labels, dtype='int32') train_3th_labels = np.asarray(train_3th_labels, dtype='int32') joint_train_labels = np.concatenate((train_labels, train_3th_labels)) test_labels = np.asarray(test_labels, dtype='int32') test_3th_labels = np.asarray(test_3th_labels, dtype='int32') joint_test_labels = np.concatenate((test_labels, test_3th_labels)) joint_train_size = len(joint_train_claims) joint_test_size = len(joint_test_claims) train_size = len(train_claims) test_size = len(test_claims) test_3th_size = len(test_3th_claims) vocab_size = len(word2id) + 1 print 'joint_train 
size: ', joint_train_size, ' joint_test size: ', joint_test_size print 'train size: ', train_size, ' test size: ', test_size print 'vocab size: ', vocab_size rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids = T.itensor3() #(batch, cand_size, sent_len) sents_mask = T.ftensor3() sents_labels = T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.fmatrix() joint_sents_ids = T.itensor3() #(batch, cand_size, sent_len) joint_sents_mask = T.ftensor3() joint_sents_labels = T.imatrix() #(batch, cand_size) joint_claim_ids = T.imatrix() #(batch, claim_len) joint_claim_mask = T.fmatrix() joint_labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context] conv_model_sents = Conv_with_Mask( rng, input_tensor3=embed_input_sents, mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_sent_emb = sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) conv_model_claims = Conv_with_Mask( rng, input_tensor3=embed_input_claim, mask_matrix=claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero claim_embeddings = conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1) # concate_claim_sent = 
T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2) # concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2)) concate_claim_sent = T.concatenate([ batch_claim_emb, batch_sent_emb, T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x') ], axis=2) concate_2_matrix = concate_claim_sent.reshape( (batch_size * cand_size, hidden_size[0] * 2 + 1)) LR_input = concate_2_matrix LR_input_size = hidden_size[0] * 2 + 1 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 1, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para = [U_a] # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(LR_input.dot(U_a)) #batch * 12 inter_matrix = score_matrix.reshape((batch_size, cand_size)) # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1) # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size))) ''' maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix) ''' # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix) # loss = -T.mean(T.log(prob_pos)) #f1 as loss batch_overlap = T.sum(sents_labels * inter_matrix, axis=1) batch_recall = batch_overlap / T.sum(sents_labels, axis=1) batch_precision = batch_overlap / T.sum(inter_matrix, axis=1) batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall + batch_precision) loss = -T.mean(T.log(batch_f1)) # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean() ''' training task2, predict 3 labels ''' joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten( )].reshape((batch_size * 
cand_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM joint_embed_input_claim = init_embeddings[ joint_claim_ids.flatten()].reshape( (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1) joint_conv_model_sents = Conv_with_Mask( rng, input_tensor3=joint_embed_input_sents, mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_sent_embeddings = joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_batch_sent_emb = joint_sent_embeddings.reshape( (batch_size, cand_size, hidden_size[0])) joint_premise_emb = T.sum(joint_batch_sent_emb * joint_sents_labels.dimshuffle(0, 1, 'x'), axis=1) #(batch, hidden_size) joint_conv_model_claims = Conv_with_Mask( rng, input_tensor3=joint_embed_input_claim, mask_matrix=joint_claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_claim_embeddings = joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_premise_hypo_emb = T.concatenate( [joint_premise_emb, joint_claim_embeddings], axis=1) #(batch, 2*hidden_size) ''' attentive conv in task2 ''' joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) joint_sents_dot = T.batched_dot( joint_sents_tensor3, joint_sents_tensor3.dimshuffle( 0, 2, 
1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) joint_sents_dot_2_matrix = T.nnet.softmax( joint_sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) joint_sents_context = T.batched_dot( joint_sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 ) #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= joint_add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0), mask_matrix=joint_sents_mask.reshape( (joint_sents_mask.shape[0] * joint_sents_mask.shape[1], joint_sents_mask.shape[2])), mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle( 0, 1, 'x') masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle( 0, 1, 'x') fine_max = T.concatenate([ T.max(masked_sents_attconv, axis=1), 
T.max(masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) "Logistic Regression layer" joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1) joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0] joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3) joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class joint_LR_para = [joint_U_a, joint_LR_b] joint_layer_LR = LogisticRegression( rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector joint_loss = joint_layer_LR.negative_log_likelihood( joint_labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. ''' testing ''' # binarize_prob = T.where( inter_matrix > 0.5, 1, 0) #(batch_size, cand_size masked_inter_matrix = inter_matrix * sents_labels #(batch, cand_size) test_premise_emb = T.sum(batch_sent_emb * masked_inter_matrix.dimshuffle(0, 1, 'x'), axis=1) test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings], axis=1) #fine-maxsum sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape( (batch_size, cand_size * sent_len, emb_size)) sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle( 0, 2, 1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) sents_dot_2_matrix = T.nnet.softmax( sents_dot.reshape( (batch_size * cand_size * sent_len, cand_size * sent_len))) sents_context = T.batched_dot( sents_dot_2_matrix.reshape( (batch_size, cand_size * sent_len, cand_size * sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) add_sents_context = embed_input_sents + sents_context.reshape( (batch_size * cand_size, sent_len, emb_size) ).dimshuffle( 0, 2, 1 
) #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0), mask_matrix=sents_mask.reshape( (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])), mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0), image_shape=(batch_size * cand_size, 1, emb_size, sent_len), image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape( (batch_size, cand_size, hidden_size[0])) #(batch_size*cand_size, hidden_size) test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape( (batch_size, cand_size, hidden_size[0])) test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle( 0, 1, 'x') test_fine_max = T.concatenate([ T.max(test_masked_sents_attconv, axis=1), T.max(test_masked_claim_attconv, axis=1) ], axis=1) #(batch, 2*hidden) # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden) test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max], axis=1) test_LR_input_size = joint_LR_input_size test_layer_LR = 
LogisticRegression( rng, input=test_LR_input, n_in=test_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector params = [init_embeddings] + NN_para + LR_para + joint_LR_para cost = loss + joint_loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids, sents_mask, sents_labels, claim_ids, claim_mask, joint_labels ], [ inter_matrix, test_layer_LR.errors(joint_labels), test_layer_LR.y_pred ], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False joint_n_train_batches = joint_train_size / batch_size joint_train_batch_start = list( np.arange(joint_n_train_batches) * batch_size) + [joint_train_size - batch_size] n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] n_test_3th_batches = test_3th_size / batch_size test_3th_batch_start = list(np.arange(n_test_3th_batches) * batch_size) + [test_3th_size - batch_size] max_acc = 0.0 max_test_f1 = 0.0 max_acc_full_evi = 0.0 cost_i = 0.0 joint_train_indices = range(joint_train_size) train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( joint_train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed random.Random(100).shuffle(train_indices) iter_accu = 0 for joint_batch_id in joint_train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1 iter_accu += 1 joint_train_id_batch = joint_train_indices[ joint_batch_id:joint_batch_id + batch_size] for i in range(3): batch_id = random.choice(train_batch_start) train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents[train_id_batch], train_sent_masks[train_id_batch], train_sent_labels[train_id_batch], train_claims[train_id_batch], train_claim_mask[train_id_batch], #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels joint_train_sents[joint_train_id_batch], joint_train_sent_masks[joint_train_id_batch], 
joint_train_sent_labels[joint_train_id_batch], joint_train_claims[joint_train_id_batch], joint_train_claim_mask[joint_train_id_batch], joint_train_labels[joint_train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() f1_sum = 0.0 error_sum = 0.0 full_evi = 0 predictions = [] for test_batch_id in test_batch_start: # for each test batch batch_prob, error_i, pred_i = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_sent_masks[test_batch_id:test_batch_id + batch_size], test_sent_labels[test_batch_id:test_batch_id + batch_size], test_claims[test_batch_id:test_batch_id + batch_size], test_claim_mask[test_batch_id:test_batch_id + batch_size], test_labels[test_batch_id:test_batch_id + batch_size]) error_sum += error_i batch_sent_labels = test_sent_labels[ test_batch_id:test_batch_id + batch_size] batch_sent_names = test_sent_names[ test_batch_id:test_batch_id + batch_size] batch_ground_names = test_ground_names[ test_batch_id:test_batch_id + batch_size] batch_ground_labels = test_labels[ test_batch_id:test_batch_id + batch_size] for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get( batch_ground_labels[i]) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped = [(batch_prob[i, k], batch_sent_labels[i][k], batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary if triple[0] >0.5: can control the recall, influence the strict_acc ''' if triple[0] > 0.5: # 
pred_sent_names.append(batch_sent_names[i][j]) pred_sent_names.append(triple[2]) # if len(pred_sent_names) == max_pred_pick: # break instance_i['predicted_evidence'] = pred_sent_names # print 'pred_sent_names:',pred_sent_names # print 'gold_sent_names:',gold_sent_names new_gold_names = [] for gold_name in gold_sent_names: new_gold_names.append([None, None] + gold_name) instance_i['evidence'] = [new_gold_names] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 # test_f1=f1_sum/(len(test_batch_start)*batch_size) for test_batch_id in test_3th_batch_start: # for each test batch _, error_i, pred_i = test_model( test_3th_sents[test_batch_id:test_batch_id + batch_size], test_3th_sent_masks[test_batch_id:test_batch_id + batch_size], test_3th_sent_labels[test_batch_id:test_batch_id + batch_size], test_3th_claims[test_batch_id:test_batch_id + batch_size], test_3th_claim_mask[test_batch_id:test_batch_id + batch_size], test_3th_labels[test_batch_id:test_batch_id + batch_size]) for i in range(batch_size): instance_i = {} instance_i['label'] = pred_id2label.get(2) instance_i['predicted_label'] = pred_id2label.get( pred_i[i]) instance_i['predicted_evidence'] = [] instance_i['evidence'] = [] predictions.append(instance_i) strict_score, label_accuracy, precision, recall, f1 = fever_score( predictions) print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1 print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.01, n_epochs=10, L2_weight=0.000001, extra_size=4, emb_size=300, posi_emb_size=50, batch_size=50, filter_size=[3, 3], maxSentLen=50, hidden_size=300): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SciTailV1_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we 
leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable posi_rand_values = rng.normal( 0.0, 0.01, (maxSentLen, posi_emb_size)) #generate a matrix by Gaussian distribution posi_embeddings = theano.shared( value=np.array(posi_rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size[0])) conv_W_posi, conv_b_posi = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + posi_emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size, 1)) NN_para = [conv_W, conv_b, conv_W_posi, conv_b_posi, conv_W_context] ''' attentive convolution function ''' attentive_conv_layer = 
Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size, 1, emb_size, 1), W=conv_W, b=conv_b, W_posi=conv_W_posi, b_posi=conv_b_posi, W_context=conv_W_context, b_context=conv_b_context, posi_emb_matrix=posi_embeddings, posi_emb_size=posi_emb_size) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r sent_embeddings_l = attentive_conv_layer.maxpool_vec_l sent_embeddings_r = attentive_conv_layer.maxpool_vec_r "form input to LR classifier" LR_input = T.concatenate([ sent_embeddings_l, sent_embeddings_r, sent_embeddings_l * sent_embeddings_r, attentive_sent_embeddings_l, attentive_sent_embeddings_r, attentive_sent_embeddings_l * attentive_sent_embeddings_r ], axis=1) LR_input_size = 6 * hidden_size U_a = create_ensemble_para( rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = [init_embeddings, posi_embeddings] + NN_para + LR_para # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() cost = loss #+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], 
train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() dev_error_sum = 0.0 for dev_batch_id in dev_batch_start: # for each test batch dev_error_i = dev_model( dev_sents_l[dev_batch_id:dev_batch_id + batch_size], dev_masks_l[dev_batch_id:dev_batch_id + batch_size], dev_sents_r[dev_batch_id:dev_batch_id + batch_size], dev_masks_r[dev_batch_id:dev_batch_id + batch_size], dev_labels_store[dev_batch_id:dev_batch_id + batch_size]) dev_error_sum += dev_error_i dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start)) if dev_acc > max_acc_dev: max_acc_dev = dev_acc print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i test_acc = 1.0 - error_sum / (len(test_batch_start)) if test_acc > max_acc_test: max_acc_test = test_acc store_model_to_file( '/home/wenpeng/workspace/SciTail/src/model_para_' + str(max_acc_test), params) print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test else: print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) 
return max_acc_test
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=500, test_batch_size=1000, emb_size=300, hidden_size=300, HL_hidden_size=200, L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=1000, para_len=60, question_len=20, c_len=7, e_len=2):
    """Train the SQuAD answer-candidate classifier and evaluate it on the dev subset.

    The paragraph and the question are each encoded by
    ``Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate``.  Head/tail positions of
    the candidate, "long" and "extension" spans are read out of the paragraph
    encoding and combined with L2-normalised masked sums of the word
    embeddings.  A two-layer auxiliary "subtask" classifier is trained on the
    same labels and its ``prop_for_posi`` output is fed, as one extra feature,
    into the main two-layer classifier.  Both losses are summed and optimised
    with AdaGrad.

    Every 10 training batches the model is scored on the whole test set; the
    parameters are written with ``store_model_to_file`` whenever the accuracy
    improves.

    Args:
        learning_rate: AdaGrad step size.
        n_epochs: maximum number of passes over the training set.
        batch_size: number of training examples per update.
        test_batch_size: number of test examples per evaluation call.
        emb_size: word-embedding dimensionality.
        hidden_size: hidden size handed to the LSTM parameter factory.
        HL_hidden_size: width of the hidden layers in both classifiers.
        L2_weight: currently unused (the L2 term is disabled).
        train_size: passed to the loader; afterwards overwritten with the
            number of loaded training examples.
        test_size: same as ``train_size`` for the test split.
        batch_size_pred: unused; kept so existing callers keep working.
        para_len: padded paragraph length.
        question_len: padded question length.
        c_len: padded candidate length.
        e_len: padded extension length.

    Returns:
        The best test accuracy observed during training (``0.0`` if the model
        was never evaluated).
    """
    model_options = locals().copy()  # must stay first: snapshots only the arguments

    def _say(*parts):
        # Single-argument print: same text under the Python 2 print statement
        # and the Python 3 print function (items joined by one space).
        print(' '.join(str(part) for part in parts))

    def _ids(values):
        return np.asarray(values, dtype='int32')

    def _mask(values):
        return np.asarray(values, dtype=theano.config.floatX)

    def _batch_starts(total, step):
        # Full batches plus one tail batch aligned to the end of the data.
        # The tail start is clamped at 0 so a data set smaller than one batch
        # yields a single short batch instead of a negative offset.
        return [i * step for i in range(total // step)] + [max(0, total - step)]

    _say("model options", model_options)
    rootPath = '/mounts/work/hs/yin/20161219/'
    storePath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)

    word2id = {}
    word2id['UNK'] = 0  # use it to pad

    # First pass over the full train/dev files: only the vocabulary they add
    # to word2id is kept, the loaded examples themselves are discarded.
    word2id = load_SQUAD_hinrich_v2(train_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewtrn.txt')[0]
    word2id = load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewdev.txt')[0]
    _say('word2id size for bigger dataset:', len(word2id))

    # Second pass: the subsets that are actually used for training/testing.
    (word2id, train_questions, train_questions_mask, train_paras, train_paras_mask,
     train_e_ids, train_e_masks, train_c_ids, train_c_masks,
     train_c_heads, train_c_tails, train_l_heads, train_l_tails,
     train_e_heads, train_e_tails, train_labels, train_labels_3c) = load_SQUAD_hinrich_v2(
        train_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewtrn,subset.000.txt')
    (word2id, test_questions, test_questions_mask, test_paras, test_paras_mask,
     test_e_ids, test_e_masks, test_c_ids, test_c_masks,
     test_c_heads, test_c_tails, test_l_heads, test_l_tails,
     test_e_heads, test_e_tails, test_labels, test_labels_3c) = load_SQUAD_hinrich_v2(
        test_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewdev,subset.000.txt')
    _say('word2id size for smaller dataset:', len(word2id))

    train_size = len(train_questions)
    test_size = len(test_questions)

    train_questions = _ids(train_questions)
    train_questions_mask = _mask(train_questions_mask)
    train_paras = _ids(train_paras)
    train_paras_mask = _mask(train_paras_mask)
    train_e_ids = _ids(train_e_ids)
    train_e_masks = _mask(train_e_masks)
    train_c_ids = _ids(train_c_ids)
    train_c_masks = _mask(train_c_masks)
    train_c_heads = _ids(train_c_heads)
    train_c_tails = _ids(train_c_tails)
    train_l_heads = _ids(train_l_heads)
    train_l_tails = _ids(train_l_tails)
    train_e_heads = _ids(train_e_heads)
    train_e_tails = _ids(train_e_tails)
    train_labels = _ids(train_labels)

    test_questions = _ids(test_questions)
    test_questions_mask = _mask(test_questions_mask)
    test_paras = _ids(test_paras)
    test_paras_mask = _mask(test_paras_mask)
    test_e_ids = _ids(test_e_ids)
    test_e_masks = _mask(test_e_masks)
    test_c_ids = _ids(test_c_ids)
    test_c_masks = _mask(test_c_masks)
    test_c_heads = _ids(test_c_heads)
    test_c_tails = _ids(test_c_tails)
    test_l_heads = _ids(test_l_heads)
    test_l_tails = _ids(test_l_tails)
    test_e_heads = _ids(test_e_heads)
    test_e_tails = _ids(test_e_tails)
    test_labels = _ids(test_labels)

    overall_vocab_size = len(word2id)
    _say('train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size)

    # Embedding table: random init, row 0 (padding) zeroed, then overwritten
    # with pretrained vectors where load_word2vec_to_init finds them.
    rand_values = random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, rng)
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.items()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # Symbolic inputs.
    para = T.imatrix()       # (batch, para_len)
    para_mask = T.fmatrix()  # (batch, para_len)
    c_ids = T.imatrix()      # (batch, c_len)
    c_mask = T.fmatrix()     # (batch, c_len)
    e_ids = T.imatrix()      # (batch, e_len)
    e_mask = T.fmatrix()     # (batch, e_len)
    c_heads = T.ivector()    # batch
    c_tails = T.ivector()    # batch
    l_heads = T.ivector()    # batch
    l_tails = T.ivector()    # batch
    e_heads = T.ivector()    # batch
    e_tails = T.ivector()    # batch
    q = T.imatrix()          # (batch, question_len)
    q_mask = T.fmatrix()     # (batch, question_len)
    labels = T.ivector()     # batch

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # The batch size is taken from the input, so short batches are accepted.
    true_batch_size = para.shape[0]

    paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2, 1))  # (batch, emb_size, para_len)
    q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2, 1))  # (batch, emb_size, question_len)

    # NOTE: the order of the rng-consuming calls below is kept as in the
    # original so that parameter initialisation is unchanged.
    fwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para = list(fwd_LSTM_para_dict.values()) + list(bwd_LSTM_para_dict.values())
    paragraph_model = Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    paragraph_reps_tensor3 = paragraph_model.output_tensor  # (batch, 2*hidden, para_len)

    fwd_LSTM_q_dict = create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_q_dict = create_LSTM_para(rng, emb_size, hidden_size)
    question_para = list(fwd_LSTM_q_dict.values()) + list(bwd_LSTM_q_dict.values())
    questions_model = Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    q_reps = questions_model.output_sent_rep_maxpooling  # (batch, 2*hidden)

    # Span representations: paragraph states at each span's head and tail.
    batch_ids = T.arange(true_batch_size)
    c_heads_reps = paragraph_reps_tensor3[batch_ids, :, c_heads]  # (batch, 2*hidden)
    c_tails_reps = paragraph_reps_tensor3[batch_ids, :, c_tails]  # (batch, 2*hidden)
    candididates_reps = T.concatenate([c_heads_reps, c_tails_reps], axis=1)  # (batch, 4*hidden)

    l_heads_reps = paragraph_reps_tensor3[batch_ids, :, l_heads]  # (batch, 2*hidden)
    l_tails_reps = paragraph_reps_tensor3[batch_ids, :, l_tails]  # (batch, 2*hidden)
    longs_reps = T.concatenate([l_heads_reps, l_tails_reps], axis=1)  # (batch, 4*hidden)

    e_heads_reps = paragraph_reps_tensor3[batch_ids, :, e_heads]  # (batch, 2*hidden)
    e_tails_reps = paragraph_reps_tensor3[batch_ids, :, e_tails]  # (batch, 2*hidden)
    extensions_reps = T.concatenate([e_heads_reps, e_tails_reps], axis=1)  # (batch, 4*hidden)

    # Embedding-level features: masked sum of word vectors, L2-normalised
    # (the 1e-20 keeps the division finite for all-padding rows).
    c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2, 1))  # (batch, emb_size, c_len)
    c_sum = T.sum(c_input*c_mask.dimshuffle(0, 'x', 1), axis=2)  # (batch, emb_size)
    average_C_batch = c_sum/T.sqrt(T.sum(c_sum**2, axis=1)+1e-20).dimshuffle(0, 'x')

    e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2, 1))  # (batch, emb_size, e_len)
    e_sum = T.sum(e_input*e_mask.dimshuffle(0, 'x', 1), axis=2)  # (batch, emb_size)
    average_E_batch = e_sum/T.sqrt(T.sum(e_sum**2, axis=1)+1e-20).dimshuffle(0, 'x')

    q_sum = T.sum(q_input*q_mask.dimshuffle(0, 'x', 1), axis=2)  # (batch, emb_size)
    average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0, 'x')

    # Auxiliary subtask classifier (question + extension features only).
    HL_layer_subtask_input = T.concatenate([q_reps, extensions_reps, average_E_batch, average_Q_batch], axis=1)  # (batch, 6*hidden+2*emb)
    HL_layer_subtask_size = 6*hidden_size+2*emb_size
    HL_layer_subtask_1 = HiddenLayer(rng, input=HL_layer_subtask_input, n_in=HL_layer_subtask_size, n_out=HL_hidden_size, activation=T.tanh)
    HL_layer_subtask_2 = HiddenLayer(rng, input=HL_layer_subtask_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh)
    U_subtask_a = create_ensemble_para(rng, 2, HL_hidden_size)
    norm_U_subtask_a = normalize_matrix(U_subtask_a)
    LR_subtask_b = theano.shared(value=np.zeros((2,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_subtask_para = [U_subtask_a, LR_subtask_b]
    layer_LR_subtask = LogisticRegression(rng, input=HL_layer_subtask_2.output, n_in=HL_hidden_size, n_out=2, W=norm_U_subtask_a, b=LR_subtask_b)

    # Main classifier: all span/question features plus the subtask's
    # prop_for_posi output as one extra column.
    HL_layer_1_input_size = 14*hidden_size+3*emb_size+1
    HL_layer_1_input = T.concatenate([q_reps, longs_reps, extensions_reps, candididates_reps,
                                      average_E_batch, average_C_batch, average_Q_batch,
                                      layer_LR_subtask.prop_for_posi.reshape((true_batch_size, 1))], axis=1)  # (batch, 14*hidden_size+3*emb_size+1)
    HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh)
    HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh)

    LR_input = HL_layer_2.output
    LR_input_size = HL_hidden_size
    U_a = create_ensemble_para(rng, 2, LR_input_size)
    norm_U_a = normalize_matrix(U_a)
    LR_b = theano.shared(value=np.zeros((2,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b)

    # Joint objective: main task NLL + subtask NLL on the same labels.
    loss = layer_LR.negative_log_likelihood(labels)+layer_LR_subtask.negative_log_likelihood(labels)

    params = LR_para+[embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params+LR_subtask_para+HL_layer_subtask_1.params+HL_layer_subtask_2.params
    cost = loss

    # AdaGrad: one squared-gradient accumulator per parameter.
    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20)))
        updates.append((acc_i, acc))

    model_inputs = [para, para_mask, c_ids, c_mask, e_ids, e_mask, c_heads, c_tails,
                    l_heads, l_tails, e_heads, e_tails, q, q_mask, labels]
    train_model = theano.function(model_inputs, cost, updates=updates, on_unused_input='ignore')
    test_model = theano.function(model_inputs, [layer_LR.errors(labels), layer_LR.y_pred], on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    patience = 500000000000000  # effectively disables the iteration-based stop
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    train_batch_start = _batch_starts(train_size, batch_size)
    test_batch_start = _batch_starts(test_size, test_batch_size)

    max_acc = 0.0
    best_iter = 0
    cost_i = 0.0
    iter_count = 0  # number of training batches processed so far, over all epochs
    train_ids = list(range(train_size))
    best_test_statistic = defaultdict(int)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
        for para_id in train_batch_start:
            iter_count += 1
            train_id_list = train_ids[para_id:para_id+batch_size]
            cost_i += train_model(
                train_paras[train_id_list],
                train_paras_mask[train_id_list],
                train_c_ids[train_id_list],
                train_c_masks[train_id_list],
                train_e_ids[train_id_list],
                train_e_masks[train_id_list],
                train_c_heads[train_id_list],
                train_c_tails[train_id_list],
                train_l_heads[train_id_list],
                train_l_tails[train_id_list],
                train_e_heads[train_id_list],
                train_e_tails[train_id_list],
                train_questions[train_id_list],
                train_questions_mask[train_id_list],
                train_labels[train_id_list])

            if iter_count % 10 == 0:
                _say('Epoch ', epoch,
                     'iter '+str(iter_count)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter_count),
                     'uses ', (time.time()-past_time)/60.0, 'min')
                past_time = time.time()

                print('Testing...')
                test_statistic = defaultdict(int)  # (gold 3-class label, prediction) -> count
                n_scored = 0  # test examples [0, n_scored) are already counted
                for test_para_id in test_batch_start:
                    batch_end = test_para_id+test_batch_size
                    gold_labels_list = test_labels_3c[test_para_id:batch_end]
                    _, preds_i = test_model(
                        test_paras[test_para_id:batch_end],
                        test_paras_mask[test_para_id:batch_end],
                        test_c_ids[test_para_id:batch_end],
                        test_c_masks[test_para_id:batch_end],
                        test_e_ids[test_para_id:batch_end],
                        test_e_masks[test_para_id:batch_end],
                        test_c_heads[test_para_id:batch_end],
                        test_c_tails[test_para_id:batch_end],
                        test_l_heads[test_para_id:batch_end],
                        test_l_tails[test_para_id:batch_end],
                        test_e_heads[test_para_id:batch_end],
                        test_e_tails[test_para_id:batch_end],
                        test_questions[test_para_id:batch_end],
                        test_questions_mask[test_para_id:batch_end],
                        test_labels[test_para_id:batch_end])
                    # The tail batch overlaps the previous one; skip the
                    # examples that were already scored so none counts twice.
                    first_new = max(0, n_scored-test_para_id)
                    for ind in range(first_new, len(gold_labels_list)):
                        test_statistic[(gold_labels_list[ind], int(preds_i[ind]))] += 1
                    n_scored = max(n_scored, batch_end)

                # Accuracy over examples whose gold label is 0 or 1 only.
                n_correct = test_statistic.get((1, 1), 0)+test_statistic.get((0, 0), 0)
                n_total = n_correct+test_statistic.get((1, 0), 0)+test_statistic.get((0, 1), 0)
                acc = n_correct*1.0/n_total if n_total else 0.0
                if acc > max_acc:
                    max_acc = acc
                    best_iter = iter_count
                    best_test_statistic = test_statistic
                    store_model_to_file(storePath+'Best_Paras_HS_v2_000_subtask_'+str(max_acc), params)
                    _say('Finished storing best params at:', max_acc)
                _say('current average acc:', acc, '\t\tmax acc:', max_acc, '\ttest_statistic:', test_statistic)
                _say('\t\t\t\tbest statistic:', best_test_statistic)

            if patience <= iter_count:
                done_looping = True
                break

        _say('Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min')
        mid_time = time.time()

    end_time = time.time()
    print('Optimization complete.')
    print('Best test accuracy of %f %% obtained at iteration %i' % (max_acc * 100., best_iter))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')
    return max_acc
def evaluate_lenet5(claim, title2sentlist, title2wordlist, word2id):
    """Classify one claim and return its label together with the picked evidence.

    The function builds the two-task Theano graph, loads trained parameters
    from a fixed file on disk, and runs a single forward pass (batch size 1):

    * task 1 scores each of the ``cand_size`` candidate sentences against the
      claim with a sigmoid; sentences scoring above 0.5 count as evidence;
    * task 2 predicts one of three claim labels from the claim embedding and
      the evidence-weighted sentence representations.

    NOTE(review): the graph is rebuilt, recompiled and the parameters are
    reloaded on every call; callers that classify many claims pay that cost
    each time.

    Args:
        claim: the claim to verify; forwarded to ``claim_input_2_theano_input``.
        title2sentlist: forwarded to ``claim_input_2_theano_input``.
        title2wordlist: forwarded to ``claim_input_2_theano_input``.
        word2id: vocabulary mapping; its size fixes the embedding table size.

    Returns:
        A string made of the predicted label followed by the selected
        sentences (at most ``max_pred_pick``, highest score first) joined by
        ``"<br />"`` and wrapped in ``"<p>"`` ... ``"<p/>"`` markers.

    Raises:
        KeyError: if the predicted class id has no entry in the label table.
    """
    emb_size = 300
    batch_size = 1
    filter_size = [3]
    sent_len = 40
    claim_len = 40
    cand_size = 10
    hidden_size = [300, 300]
    max_pred_pick = 5  # upper bound on the number of evidence sentences returned

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)
    # srng itself is not used below, but rng.randint() advances rng; the call
    # is kept so every later draw from rng is the same as before.
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    claim_idlist, claim_masklist, sent_ins_ids, sent_ins_mask, sent_cand_list = claim_input_2_theano_input(
        claim, word2id, claim_len, sent_len, cand_size, title2sentlist, title2wordlist)

    # Wrap the single example into batch-of-one arrays.
    test_claims = np.asarray([claim_idlist], dtype='int32')
    test_claim_mask = np.asarray([claim_masklist], dtype=theano.config.floatX)
    test_sents = np.asarray([sent_ins_ids], dtype='int32')
    test_sent_masks = np.asarray([sent_ins_mask], dtype=theano.config.floatX)

    vocab_size = len(word2id) + 1
    # Placeholder values; the embedding table is overwritten by
    # load_model_from_file below because it is part of `params`.
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)

    # The first block identifies evidence sentences (task 1); the second block
    # does textual entailment: given the evidence scores, predict the claim
    # label (task 2).
    sents_ids = T.itensor3()   # (batch, cand_size, sent_len)
    sents_mask = T.ftensor3()  # (batch, cand_size, sent_len)
    claim_ids = T.imatrix()    # (batch, claim_len)
    claim_mask = T.fmatrix()   # (batch, claim_len)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    conv_filter_shape = (hidden_size[0], 1, emb_size, filter_size[0])
    context_filter_shape = (hidden_size[0], 1, emb_size, 1)
    sents_image_shape = (batch_size * cand_size, 1, emb_size, sent_len)
    claims_image_shape_rep = (batch_size * cand_size, 1, emb_size, claim_len)
    # Candidate masks flattened to one row per (example, candidate) pair.
    flat_sents_mask = sents_mask.reshape(
        (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2]))

    embed_input_sents = init_embeddings[sents_ids.flatten()].reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)  # (batch*cand_size, emb_size, sent_len)
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)  # (batch, emb_size, claim_len)

    # Shared parameters.
    conv_W, conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    # Task 1 parameters.
    task1_att_conv_W, task1_att_conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    task1_conv_W_context, task1_conv_b_context = create_conv_para(rng, filter_shape=context_filter_shape)
    # Task 2 parameters.
    att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    conv_W_context, conv_b_context = create_conv_para(rng, filter_shape=context_filter_shape)

    # NOTE(review): the two context biases are not listed here, so they are
    # not restored by load_model_from_file. The list must stay as is to match
    # the layout of the stored parameter file.
    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    # Plain CNN + max-pooling sentence and claim embeddings.
    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=flat_sents_mask,
        image_shape=sents_image_shape,
        filter_shape=conv_filter_shape,
        W=conv_W,
        b=conv_b)
    sent_embeddings = conv_model_sents.maxpool_vec  # (batch_size*cand_size, hidden_size)
    batch_sent_emb = sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=conv_filter_shape,
        W=conv_W,
        b=conv_b)
    claim_embeddings = conv_model_claims.maxpool_vec  # (batch_size, hidden_size)
    # One copy of the claim embedding per candidate sentence.
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1)

    # Attentive conv for task 1: each candidate sentence paired with the claim.
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_sents,  # (batch_size*cand_size, emb_size, sent_len)
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=flat_sents_mask,
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=sents_image_shape,
        image_shape_r=claims_image_shape_rep,
        filter_shape=conv_filter_shape,
        filter_shape_context=context_filter_shape,
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  # (batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    # [claim ; sentence ; claim . sentence] per candidate.
    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ], axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    # To score each evidence sentence we use the output of the attentive conv
    # as well as the output of the standard CNN.
    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ], axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    U_a = create_ensemble_para(rng, 1, LR_input_size)  # (LR_input_size, 1) scoring vector
    LR_para = [U_a]

    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  # (batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))
    binarize_prob = T.where(inter_matrix > 0.5, 1, 0)  # (batch_size, cand_size)
    # Keep the score of sentences above the threshold, zero out the others.
    sents_labels = inter_matrix * binarize_prob

    # Task 2: predict one of 3 labels.
    premise_emb = T.sum(batch_sent_emb * sents_labels.dimshuffle(0, 1, 'x'), axis=1)  # (batch, hidden_size)
    premise_hypo_emb = T.concatenate([premise_emb, claim_embeddings], axis=1)  # (batch, 2*hidden_size)

    # Self-attention over all words of all candidate sentences; the attended
    # context is added to (not concatenated with) the word embeddings.
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(
        sents_tensor3,
        sents_tensor3.dimshuffle(0, 2, 1))  # (batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape((batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape((batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  # (batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)  # (batch_size*cand_size, emb_size, sent_len)

    # Attentive conv for task 2.
    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=add_sents_context,
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=flat_sents_mask,
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=sents_image_shape,
        image_shape_r=claims_image_shape_rep,
        filter_shape=conv_filter_shape,
        filter_shape_context=context_filter_shape,
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size, hidden_size[0]))
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * sents_labels.dimshuffle(0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * sents_labels.dimshuffle(0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ], axis=1)  # (batch, 2*hidden)

    # Logistic regression layer for the 3-way claim label.
    joint_LR_input = T.concatenate([premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]
    joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  # bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]
    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b)

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    print('initialze model parameters...')
    load_model_from_file(
        '/home1/w/wenpeng/dataset/FEVER/model_para_0.9936287838053803', params)

    test_model = theano.function(
        [sents_ids, sents_mask, claim_ids, claim_mask],
        [inter_matrix, binarize_prob, joint_layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ##############
    # RUN MODEL  #
    ##############
    print('... testing')
    batch_score_vec, batch_binary_vec, pred_i = test_model(
        test_sents, test_sent_masks, test_claims, test_claim_mask)

    # Walk the candidates from highest to lowest score and keep those flagged
    # as evidence, up to max_pred_pick of them.
    sorted_indices = np.argsort(batch_score_vec[0])[::-1]
    selected_sents = []
    for index in sorted_indices:
        if batch_binary_vec[0][index] == 1:
            selected_sents.append(sent_cand_list[index])
            if len(selected_sents) >= max_pred_pick:
                break

    # Direct lookup: an unknown class id raises KeyError instead of the
    # "NoneType + str" TypeError that dict.get() would lead to.
    predicted_label = pred_id2label[int(pred_i[0])]
    return predicted_label + '"<p>"' + '"<br />"'.join(selected_sents) + '"<p/>"'
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size=[5, 5], char_filter_size=3, margin=2.0, max_EM=50.302743615): test_batch_size = batch_size * 10 model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) word2id = {} char2id = {} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id = load_squad_cnn_rank_word_train( word2id, char2id, p_len_limit, q_len_limit, char_len) train_size = len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_word_dev( word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size = len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, 
dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, rng) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_glove() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, rng) char_embeddings = theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices = T.imatrix() #batch, (start, end) for each sample para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask = T.fmatrix() char_q_mask = T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] common_input_p = embeddings[paragraph.flatten()].reshape( (true_batch_size, true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q = embeddings[questions.flatten()].reshape( (true_batch_size, q_len_limit, emb_size)) char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape( (true_batch_size * true_p_len, char_len, char_emb_size )) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q = char_embeddings[char_questions.flatten()].reshape( (true_batch_size * q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape( (true_batch_size * true_p_len, char_len)) char_q_masks = char_q_mask.reshape( (true_batch_size * q_len_limit, char_len)) conv_W_char, conv_b_char = create_conv_para( rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1 = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_1_q, conv_b_1_q = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) NN_para = [ conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char ] input4score = squad_cnn_rank_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask, char_p_masks, char_q_masks) #(batch, 4*hidden, p_len_limit) test_input4score = squad_cnn_rank_word( rng, common_input_p, common_input_q, 
char_common_input_p, char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask, char_p_masks, char_q_masks) #(batch, 4*hidden, p_len_limit) # gram_size = 5*true_p_len-(0+1+2+3+4) HL_1_para = create_ensemble_para(rng, hidden_size, 6 * hidden_size + char_emb_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a = normalize_matrix(U_a) norm_HL_1_para = normalize_matrix(HL_1_para) norm_HL_2_para = normalize_matrix(HL_2_para) norm_HL_3_para = normalize_matrix(HL_3_para) norm_HL_4_para = normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para(rng, hidden_size, 6 * hidden_size + char_emb_size) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para(rng, 1, hidden_size) end_norm_U_a = normalize_matrix(end_U_a) end_norm_HL_1_para = normalize_matrix(end_HL_1_para) end_norm_HL_2_para = normalize_matrix(end_HL_2_para) end_norm_HL_3_para = normalize_matrix(end_HL_3_para) end_norm_HL_4_para = normalize_matrix(end_HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_U_a, batch_size, true_p_len) span_scores = T.nnet.softmax(span_scores_matrix) #(batch, para_len) end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, batch_size, true_p_len) end_span_scores = T.nnet.softmax( end_span_scores_matrix) #(batch, 
para_len) loss_neg_likelihood = -T.mean( T.log(span_scores[T.arange(batch_size), gold_indices[:, 0]])) end_loss_neg_likelihood = -T.mean( T.log(span_scores[T.arange(batch_size), gold_indices[:, 1]])) #ranking loss start tanh_span_scores_matrix = span_scores #T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor( index_matrix[T.arange(batch_size), gold_indices[:, 0]], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0 - new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega)) #ranking loss END end_tanh_span_scores_matrix = end_span_scores #T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor( end_index_matrix[T.arange(batch_size), gold_indices[:, 1]], 1.0) end_prob_batch_posi = end_tanh_span_scores_matrix[ end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_span_scores_matrix[( 1.0 - end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle( 'x', 0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean( T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega)) loss = loss_neg_likelihood + end_loss_neg_likelihood + loss_rank + end_loss_rank #test test_span_scores_matrix = add_HLs_2_tensor3( test_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_U_a, true_batch_size, true_p_len) #(batch, test_p_len) mask_test_return = 
T.argmax(test_span_scores_matrix * para_mask, axis=1) #batch end_test_span_scores_matrix = add_HLs_2_tensor3( test_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size, true_p_len) #(batch, test_p_len) end_mask_test_return = T.argmax(end_test_span_scores_matrix * para_mask, axis=1) #batch params = ( [embeddings, char_embeddings] + NN_para + [U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] + [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para]) L2_reg = L2norm_paraList([ embeddings, char_embeddings, conv_W_1, conv_W_2, conv_W_1_q, conv_W_2_q, conv_W_char, U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para ]) #L2_reg = L2norm_paraList(params) cost = loss + L2_weight * L2_reg accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function( [ paragraph, questions, gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask, true_p_len ], cost, updates=updates, on_unused_input='ignore') test_model = theano.function([ paragraph, questions, para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len ], [mask_test_return, end_mask_test_return], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / test_batch_size # remain_test=test_size%batch_size test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.Random(4).shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_ids[para_id:para_id + batch_size] cost_i += train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict = {} q_amount = 0 p1 = 0 for test_para_id in test_batch_start: batch_predict_ids, batch_predict_end_ids = test_model( test_para_list[test_para_id:test_para_id + test_batch_size], test_Q_list[test_para_id:test_para_id + test_batch_size], test_para_mask[test_para_id:test_para_id + test_batch_size], test_Q_mask[test_para_id:test_para_id + test_batch_size], test_para_char_list[test_para_id:test_para_id + test_batch_size], test_Q_char_list[test_para_id:test_para_id + test_batch_size], test_para_char_mask[test_para_id:test_para_id + test_batch_size], test_Q_char_mask[test_para_id:test_para_id + test_batch_size], test_p_len_limit) test_para_wordlist_batch = test_para_wordlist_list[ test_para_id:test_para_id + test_batch_size] # test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] # q_amount+=test_batch_size q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size] q_amount += test_batch_size for q in range(test_batch_size): #for each question # pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q]) start = batch_predict_ids[q] end = batch_predict_end_ids[q] if end < start: start, end = end, start pred_ans = ' '.join( test_para_wordlist_batch[q][start:end + 1]) q_id = q_ids_batch[q] pred_dict[q_id] = pred_ans with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json', rootPath + 'predictions.txt') if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - 
mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=20, cand_size=10,hidden_size=[300,300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, _, word2id = load_fever_train(sent_len, claim_len, cand_size) test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names,test_ground_names,_, word2id = load_fever_dev(sent_len, claim_len, cand_size, word2id) train_sents=np.asarray(train_sents, dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') test_sents=np.asarray(test_sents, dtype='int32') train_sent_masks=np.asarray(train_sent_masks, dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_sent_masks=np.asarray(test_sent_masks, dtype=theano.config.floatX) train_sent_labels=np.asarray(train_sent_labels, dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sent_labels=np.asarray(test_sent_labels, dtype='int32') train_claims=np.asarray(train_claims, dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') test_claims=np.asarray(test_claims, dtype='int32') train_claim_mask=np.asarray(train_claim_mask, dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_claim_mask=np.asarray(test_claim_mask, dtype=theano.config.floatX) # train_labels_store=np.asarray(all_labels[0], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[2], dtype='int32') train_size=len(train_claims) # dev_size=len(dev_labels_store) test_size=len(test_claims) print 
'train size: ', train_size, ' test size: ', test_size vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids=T.itensor3() #(batch, cand_size, sent_len) sents_mask=T.ftensor3() sents_labels=T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.imatrix() # labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' embed_input_sents=init_embeddings[sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim=init_embeddings[claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) # conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) NN_para = [conv_W, conv_b] conv_model_sents = Conv_with_Mask(rng, input_tensor3=embed_input_sents, mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])), image_shape=(batch_size*cand_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings=conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding 
of length hidden_size batch_sent_emb = sent_embeddings.reshape((batch_size, cand_size, hidden_size[0])) conv_model_claims = Conv_with_Mask(rng, input_tensor3=embed_input_claim, mask_matrix = claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b) #mutiple mask with the conv_out to set the features by UNK to zero claim_embeddings=conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0,'x', 1), cand_size, axis=1) concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2) concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2)) LR_input = concate_2_matrix#T.concatenate([sent_embeddings,sent_embeddings2], axis=1) LR_input_size = hidden_size[0]*2 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para(rng, 1, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a] # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(concate_2_matrix.dot(U_a)) #batch * 12 inter_matrix = score_matrix.reshape((batch_size, cand_size)) # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1) # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size))) ''' maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix) ''' prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix) loss = -T.mean(T.log(prob_pos)) # # "Logistic Regression layer" # LR_input = 
T.concatenate([attentive_sent_embeddings_l,attentive_sent_embeddings_r,attentive_sent_embeddings_l+attentive_sent_embeddings_r,attentive_sent_embeddings_l*attentive_sent_embeddings_r],axis=1) # LR_input_size=4*hidden_size[0] # # U_a = create_ensemble_para(rng, 3, LR_input_size) # (input_size, 3) # LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class # LR_para=[U_a, LR_b] # # layer_LR=LogisticRegression(rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector # loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. ''' testing ''' binarize_prob = T.where( inter_matrix > 0.5, 1, 0) #(batch_size, cand_size params = [init_embeddings]+NN_para+LR_para cost=loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost,params, learning_rate) train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_ids,sents_mask,claim_ids,claim_mask], inter_matrix, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_train_batches=train_size/batch_size train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches=test_size/batch_size test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_acc_dev=0.0 max_test_f1=0.0 cost_i=0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu=0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_indices[batch_id:batch_id+batch_size] ''' train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask sents_ids,sents_mask,sents_labels,claim_ids,claim_mask ''' cost_i+= train_model( train_sents[train_id_batch], train_sent_masks[train_id_batch], train_sent_labels[train_id_batch], train_claims[train_id_batch], train_claim_mask[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() ''' test test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, sents_ids,sents_mask,claim_ids,claim_mask ''' f1_sum=0.0 for test_batch_id in test_batch_start: # for each test batch batch_prob=test_model( test_sents[test_batch_id:test_batch_id+batch_size], 
test_sent_masks[test_batch_id:test_batch_id+batch_size], test_claims[test_batch_id:test_batch_id+batch_size], test_claim_mask[test_batch_id:test_batch_id+batch_size]) batch_sent_labels = test_sent_labels[test_batch_id:test_batch_id+batch_size] batch_sent_names = test_sent_names[test_batch_id:test_batch_id+batch_size] batch_ground_names = test_ground_names[test_batch_id:test_batch_id+batch_size] for i in range(batch_size): pred_sent_names = [] gold_sent_names = batch_ground_names[i] zipped=[(batch_prob[i,k],batch_sent_labels[i][k],batch_sent_names[i][k]) for k in range(cand_size)] sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True) # print 'sorted_zip:', sorted_zip # exit(0) for j in range(cand_size): triple = sorted_zip[j] if triple[1] == 1.0: ''' we should consider a rank, instead of binary ''' if triple[0] >0.5: pred_sent_names.append(batch_sent_names[i][j]) if len(pred_sent_names) == max_pred_pick: break f1_i = compute_f1_two_list_names(pred_sent_names, gold_sent_names) f1_sum+=f1_i test_f1=f1_sum/(len(test_batch_start)*batch_size) if test_f1 > max_test_f1: max_test_f1=test_f1 print '\t\tcurrent test_f1:', test_f1,' ; ','\t\t\t\t\tmax_test_f1:', max_test_f1 print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95): maxSentLength=max_s_length+2*(window_width-1) maxDocLength=max_d_length+2*(window_width-1) model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'; rng = numpy.random.RandomState(23455) train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DQAAAA(rootPath+'vocab_DQAAAA.txt', rootPath+'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath+'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test [train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4, train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4, train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4]=train_data [test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4, test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4]=test_data n_train_batches=train_size/batch_size n_test_batches=test_size/batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size) test_batch_start=list(numpy.arange(n_test_batches)*batch_size) # 
indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DQAAAA_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings=theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum=0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A1= T.lvector() index_A2= T.lvector() index_A3= T.lvector() index_A4= T.lvector() # y = T.lvector() len_D=T.lscalar() len_D_s=T.lvector() len_Q=T.lscalar() len_A1=T.lscalar() len_A2=T.lscalar() len_A3=T.lscalar() len_A4=T.lscalar() left_D=T.lscalar() left_D_s=T.lvector() left_Q=T.lscalar() left_A1=T.lscalar() left_A2=T.lscalar() left_A3=T.lscalar() left_A4=T.lscalar() right_D=T.lscalar() right_D_s=T.lvector() right_Q=T.lscalar() right_A1=T.lscalar() right_A2=T.lscalar() right_A3=T.lscalar() right_A4=T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc 
shape filter_words=(emb_size,window_width) filter_sents=(nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input')#.dimshuffle(0, 'x', 1, 2) layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_Q_input')#.dimshuffle(0, 'x', 1, 2) layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_A1_input')#.dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2) layer0_A4_input = embeddings[index_A4.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2) U, W, b, Ub,Wb,bb=create_Bi_GRU_para(rng, emb_size, nkerns[0]) layer0_para=[U, W, b, Ub,Wb,bb] # conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) # layer2_para=[conv2_W, conv2_b] # high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # highW_para=[high_W, high_b] #load_model(params) layer0_D = Bi_GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D,:,:], lefts=left_D_s[left_D:-right_D], rights=right_D_s[left_D:-right_D], hidden_dim=nkerns[0], U=U,W=W,b=b, Ub=Ub,Wb=Wb,bb=bb) layer0_Q = Bi_GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, 
hidden_dim=nkerns[0],U=U,W=W,b=b,U_b=Ub,W_b=Wb,b_b=bb, bptt_truncate=-1) layer0_A1 = Bi_GRU_Matrix_Input(X=layer0_A1_input[:,left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,U_b=Ub,W_b=Wb,b_b=bb, bptt_truncate=-1) layer0_A2 = Bi_GRU_Matrix_Input(X=layer0_A2_input[:,left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,U_b=Ub,W_b=Wb,b_b=bb, bptt_truncate=-1) layer0_A3 = Bi_GRU_Matrix_Input(X=layer0_A3_input[:,left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,U_b=Ub,W_b=Wb,b_b=bb, bptt_truncate=-1) layer0_A4 = Bi_GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,U_b=Ub,W_b=Wb,b_b=bb, bptt_truncate=-1) layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output') # hidden*2 layer0_Q_output=debug_print(layer0_Q.output_vector_last, 'layer0_Q.output') # hidden*4 layer0_A1_output=debug_print(layer0_A1.output_vector_last, 'layer0_A1.output') layer0_A2_output=debug_print(layer0_A2.output_vector_last, 'layer0_A2.output') layer0_A3_output=debug_print(layer0_A3.output_vector_last, 'layer0_A3.output') layer0_A4_output=debug_print(layer0_A4.output_vector_last, 'layer0_A4.output') #before reasoning, do a GRU for doc: d U_d, W_d, b_d, U_db, W_db, b_db=create_Bi_GRU_para(rng, nkerns[0]*2, nkerns[0]*2) layer_d_para=[U_d, W_d, b_d, U_db, W_db, b_db] layer_D_GRU = Bi_GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0]*2, hidden_dim=nkerns[0]*2,U=U_d,W=W_d,b=b_d,U_b=U_db,W_b=W_db,b_b=b_db, bptt_truncate=-1) #Reasoning Layer 1 repeat_Q=debug_print(T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0],1)), maxDocLength, axis=1)[:,:layer_D_GRU.output_matrix.shape[1]], 'repeat_Q') input_DNN=debug_print(T.concatenate([layer_D_GRU.output_matrix,repeat_Q], axis=0).transpose(), 'input_DNN')#each row is an example output_DNN1=HiddenLayer(rng, input=input_DNN, n_in=nkerns[0]*8, n_out=nkerns[0]) attention_W=create_ensemble_para(rng, nkerns[0], 1) 
attention_weights=T.nnet.softmax(T.dot(attention_W,output_DNN1.output.transpose())) repeat_attentions=T.repeat(attention_weights, layer_D_GRU.output_matrix.shape[0], axis=0) doc_r=T.sum(layer_D_GRU.output_matrix*repeat_attentions, axis=1) combine_DQ=T.concatenate([doc_r, layer0_Q_output], axis=0) # dim: hidden*6 output_DNN2=HiddenLayer(rng, input=combine_DQ, n_in=nkerns[0]*8, n_out=nkerns[0]*4) # DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out') # U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_pooling_para=[U_p, W_p, b_p] # pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1) # translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1') # # # #before reasoning, do a GRU for doc: d2 # U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_d2_para=[U_d2, W_d2, b_d2] # layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1) # #Reasoning Layer 2 # repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1') # input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example # output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0]) # output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0]) # # DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2') # U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_pooling_para2=[U_p2, W_p2, b_p2] # pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1) translated_Q2=debug_print(output_DNN2.output, 'translated_Q2') QA1=T.concatenate([translated_Q2, layer0_A1_output], axis=0)#dim: hidden*5 QA2=T.concatenate([translated_Q2, 
layer0_A2_output], axis=0) QA3=T.concatenate([translated_Q2, layer0_A3_output], axis=0) QA4=T.concatenate([translated_Q2, layer0_A4_output], axis=0) W_HL,b_HL=create_HiddenLayer_para(rng, n_in=nkerns[0]*8, n_out=1) match_params=[W_HL,b_HL] QA1_match=HiddenLayer(rng, input=QA1, n_in=nkerns[0]*8, n_out=1, W=W_HL, b=b_HL) QA2_match=HiddenLayer(rng, input=QA2, n_in=nkerns[0]*8, n_out=1, W=W_HL, b=b_HL) QA3_match=HiddenLayer(rng, input=QA3, n_in=nkerns[0]*8, n_out=1, W=W_HL, b=b_HL) QA4_match=HiddenLayer(rng, input=QA4, n_in=nkerns[0]*8, n_out=1, W=W_HL, b=b_HL) # simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1') # simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4') simi_overall_level1=debug_print(QA1_match.output[0], 'simi_overall_level1') simi_overall_level2=debug_print(QA2_match.output[0], 'simi_overall_level2') simi_overall_level3=debug_print(QA3_match.output[0], 'simi_overall_level3') simi_overall_level4=debug_print(QA4_match.output[0], 'simi_overall_level4') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) #only use overall_simi cost=T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level3-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level4-simi_overall_level1) # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) posi_simi=simi_overall_level1 nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) # #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # 
posi_simi=simi_1 # nega_simi=T.max([simi_2, simi_3, simi_4]) L2_reg =debug_print((U**2).sum()+(W**2).sum() +(Ub**2).sum()+(Wb**2).sum() +(output_DNN1.W**2).sum() +(output_DNN2.W**2).sum() +(U_d**2).sum()+(W_d**2).sum() +(U_db**2).sum()+(W_db**2).sum() +(W_HL**2).sum() +(attention_W**2).sum() , 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost=debug_print(cost+L2_weight*L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function([index], [cost, posi_simi, nega_simi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index], right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') params = layer0_para+output_DNN1.params+output_DNN2.params+match_params+layer_d_para+[attention_W] # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') # acc = decay*acc_i + 
(1-decay)*T.sqr(grad_i) #rmsprop # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6))) # updates.append((acc_i, acc)) def AdaDelta_updates(parameters,gradients,rho,eps): # create variables to store intermediate updates gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] # calculates the new "average" delta for the next iteration gradients_sq_new = [ rho*g_sq + (1-rho)*(g**2) for g_sq,g in zip(gradients_sq,gradients) ] # calculates the step in direction. The square root is an approximation to getting the RMS for the average value deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ] # calculates the new "average" deltas for the next step. deltas_sq_new = [ rho*d_sq + (1-rho)*(d**2) for d_sq,d in zip(deltas_sq,deltas) ] # Prepare it as a list f gradient_sq_updates = zip(gradients_sq,gradients_sq_new) deltas_sq_updates = zip(deltas_sq,deltas_sq_new) parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ] return gradient_sq_updates + deltas_sq_updates + parameters_updates updates=AdaDelta_updates(params, grads, decay, 1e-6) train_model = theano.function([index], [cost, posi_simi, nega_simi], updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], 
right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function([index], [cost, posi_simi, nega_simi], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc=0.0 best_epoch=0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 # shuffle(train_batch_start)#shuffle training data corr_train=0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index +1 sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) ) sys.stdout.flush() minibatch_index=minibatch_index+1 cost_average, posi_simi, nega_simi= train_model(batch_start) if posi_simi>nega_simi: corr_train+=1 if iter % n_train_batches == 0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+'corr rate:'+str(corr_train*100.0/train_size) if iter % validation_frequency == 0: corr_test=0 for i in test_batch_start: cost, posi_simi, nega_simi=test_model(i) if posi_simi>nega_simi: corr_test+=1 #write_file.close() #test_score = numpy.mean(test_losses) test_acc=corr_test*1.0/test_size #test_acc=1-test_score print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches,test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better=False if test_acc > max_acc: max_acc=test_acc best_epoch=epoch find_better=True print '\t\t\ttest_acc:', test_acc, 'max:', max_acc,'(at',best_epoch,')' if find_better==True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 
100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=3, maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95): maxSentLength = max_s_length + 2 * (window_width - 1) maxDocLength = max_d_length + 2 * (window_width - 1) model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/' rng = numpy.random.RandomState(23455) train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DQAAAA( rootPath + 'vocab_DQAAAA.txt', rootPath + 'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath + 'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length, maxSentLength, maxDocLength) #vocab_size contain train, dev and test [ train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4, train_rightPad_D, train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4 ] = train_data [ test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4 ] = test_data n_train_batches = train_size / batch_size n_test_batches = test_size / batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) test_batch_start = 
list(numpy.arange(n_test_batches) * batch_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int64') # indices_train_r=T.cast(indices_train_r, 'int64') # indices_test_l=T.cast(indices_test_l, 'int64') # indices_test_r=T.cast(indices_test_r, 'int64') rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) #rand_values[0]=numpy.array([1e-50]*emb_size) rand_values = load_word2vec_to_init( rand_values, rootPath + 'vocab_DQAAAA_glove_50d.txt') #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt') embeddings = theano.shared(value=rand_values, borrow=True) #cost_tmp=0 error_sum = 0 # allocate symbolic variables for the data index = T.lscalar() index_D = T.lmatrix() # now, x is the index matrix, must be integer index_Q = T.lvector() index_A1 = T.lvector() index_A2 = T.lvector() index_A3 = T.lvector() index_A4 = T.lvector() # y = T.lvector() len_D = T.lscalar() len_D_s = T.lvector() len_Q = T.lscalar() len_A1 = T.lscalar() len_A2 = T.lscalar() len_A3 = T.lscalar() len_A4 = T.lscalar() left_D = T.lscalar() left_D_s = T.lvector() left_Q = T.lscalar() left_A1 = T.lscalar() left_A2 = T.lscalar() left_A3 = T.lscalar() left_A4 = T.lscalar() right_D = T.lscalar() right_D_s = T.lvector() right_Q = T.lscalar() right_A1 = T.lscalar() right_A2 = T.lscalar() right_A3 = T.lscalar() right_A4 = T.lscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 
2, 1).flatten() ishape = (emb_size, maxSentLength) # sentence shape dshape = (nkerns[0], maxDocLength) # doc shape filter_words = (emb_size, window_width) filter_sents = (nkerns[0], window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? # length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape( (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input') #.dimshuffle(0, 'x', 1, 2) layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape( (maxSentLength, emb_size)).transpose(), 'layer0_Q_input') #.dimshuffle(0, 'x', 1, 2) layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape( (maxSentLength, emb_size)).transpose(), 'layer0_A1_input') #.dimshuffle(0, 'x', 1, 2) layer0_A2_input = embeddings[index_A2.flatten()].reshape( (maxSentLength, emb_size)).transpose() #.dimshuffle(0, 'x', 1, 2) layer0_A3_input = embeddings[index_A3.flatten()].reshape( (maxSentLength, emb_size)).transpose() #.dimshuffle(0, 'x', 1, 2) layer0_A4_input = embeddings[index_A4.flatten()].reshape( (maxSentLength, emb_size)).transpose() #.dimshuffle(0, 'x', 1, 2) U, W, b, Ub, Wb, bb = create_Bi_GRU_para(rng, emb_size, nkerns[0]) layer0_para = [U, W, b, Ub, Wb, bb] # conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1])) # layer2_para=[conv2_W, conv2_b] # high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # highW_para=[high_W, high_b] #load_model(params) layer0_D = Bi_GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :], lefts=left_D_s[left_D:-right_D], rights=right_D_s[left_D:-right_D], hidden_dim=nkerns[0], U=U, W=W, b=b, Ub=Ub, 
Wb=Wb, bb=bb) layer0_Q = Bi_GRU_Matrix_Input(X=layer0_Q_input[:, left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A1 = Bi_GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A2 = Bi_GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A3 = Bi_GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_A4 = Bi_GRU_Matrix_Input(X=layer0_A4_input[:, left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0], U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb, bptt_truncate=-1) layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output') # hidden*2 layer0_Q_output = debug_print(layer0_Q.output_vector_last, 'layer0_Q.output') # hidden*4 layer0_A1_output = debug_print(layer0_A1.output_vector_last, 'layer0_A1.output') layer0_A2_output = debug_print(layer0_A2.output_vector_last, 'layer0_A2.output') layer0_A3_output = debug_print(layer0_A3.output_vector_last, 'layer0_A3.output') layer0_A4_output = debug_print(layer0_A4.output_vector_last, 'layer0_A4.output') #before reasoning, do a GRU for doc: d U_d, W_d, b_d, U_db, W_db, b_db = create_Bi_GRU_para( rng, nkerns[0] * 2, nkerns[0] * 2) layer_d_para = [U_d, W_d, b_d, U_db, W_db, b_db] layer_D_GRU = Bi_GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0] * 2, hidden_dim=nkerns[0] * 2, U=U_d, W=W_d, b=b_d, U_b=U_db, W_b=W_db, b_b=b_db, bptt_truncate=-1) #Reasoning Layer 1 repeat_Q = debug_print( T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0], 1)), maxDocLength, axis=1)[:, :layer_D_GRU.output_matrix.shape[1]], 'repeat_Q') input_DNN = debug_print( T.concatenate([layer_D_GRU.output_matrix, repeat_Q], axis=0).transpose(), 'input_DNN') 
#each row is an example output_DNN1 = HiddenLayer(rng, input=input_DNN, n_in=nkerns[0] * 8, n_out=nkerns[0]) attention_W = create_ensemble_para(rng, nkerns[0], 1) attention_weights = T.nnet.softmax( T.dot(attention_W, output_DNN1.output.transpose())) repeat_attentions = T.repeat(attention_weights, layer_D_GRU.output_matrix.shape[0], axis=0) doc_r = T.sum(layer_D_GRU.output_matrix * repeat_attentions, axis=1) combine_DQ = T.concatenate([doc_r, layer0_Q_output], axis=0) # dim: hidden*6 output_DNN2 = HiddenLayer(rng, input=combine_DQ, n_in=nkerns[0] * 8, n_out=nkerns[0] * 4) # DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out') # U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_pooling_para=[U_p, W_p, b_p] # pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1) # translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1') # # # #before reasoning, do a GRU for doc: d2 # U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_d2_para=[U_d2, W_d2, b_d2] # layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1) # #Reasoning Layer 2 # repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1') # input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example # output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0]) # output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0]) # # DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2') # U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0]) # layer_pooling_para2=[U_p2, W_p2, b_p2] # pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1) 
translated_Q2 = debug_print(output_DNN2.output, 'translated_Q2') QA1 = T.concatenate([translated_Q2, layer0_A1_output], axis=0) #dim: hidden*5 QA2 = T.concatenate([translated_Q2, layer0_A2_output], axis=0) QA3 = T.concatenate([translated_Q2, layer0_A3_output], axis=0) QA4 = T.concatenate([translated_Q2, layer0_A4_output], axis=0) W_HL, b_HL = create_HiddenLayer_para(rng, n_in=nkerns[0] * 8, n_out=1) match_params = [W_HL, b_HL] QA1_match = HiddenLayer(rng, input=QA1, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) QA2_match = HiddenLayer(rng, input=QA2, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) QA3_match = HiddenLayer(rng, input=QA3, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) QA4_match = HiddenLayer(rng, input=QA4, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL) # simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1') # simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2') # simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3') # simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4') simi_overall_level1 = debug_print(QA1_match.output[0], 'simi_overall_level1') simi_overall_level2 = debug_print(QA2_match.output[0], 'simi_overall_level2') simi_overall_level3 = debug_print(QA3_match.output[0], 'simi_overall_level3') simi_overall_level4 = debug_print(QA4_match.output[0], 'simi_overall_level4') # eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA)) #only use overall_simi cost = T.maximum( 0.0, margin + simi_overall_level2 - simi_overall_level1) + T.maximum( 0.0, margin + simi_overall_level3 - simi_overall_level1) + T.maximum( 0.0, margin + simi_overall_level4 - simi_overall_level1) # cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi) posi_simi = 
simi_overall_level1 nega_simi = T.max( [simi_overall_level2, simi_overall_level3, simi_overall_level4]) # #use ensembled simi # cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi) # posi_simi=simi_1 # nega_simi=T.max([simi_2, simi_3, simi_4]) L2_reg = debug_print( (U**2).sum() + (W**2).sum() + (Ub**2).sum() + (Wb**2).sum() + (output_DNN1.W**2).sum() + (output_DNN2.W**2).sum() + (U_d**2).sum() + (W_d**2).sum() + (U_db**2).sum() + (W_db**2).sum() + (W_HL**2).sum() + (attention_W**2).sum(), 'L2_reg' ) #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum() cost = debug_print(cost + L2_weight * L2_reg, 'cost') #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost') test_model = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: test_data_D[index], #a matrix index_Q: test_data_Q[index], index_A1: test_data_A1[index], index_A2: test_data_A2[index], index_A3: test_data_A3[index], index_A4: test_data_A4[index], len_D: test_Length_D[index], len_D_s: test_Length_D_s[index], len_Q: test_Length_Q[index], len_A1: test_Length_A1[index], len_A2: test_Length_A2[index], len_A3: test_Length_A3[index], len_A4: test_Length_A4[index], left_D: test_leftPad_D[index], left_D_s: test_leftPad_D_s[index], left_Q: test_leftPad_Q[index], left_A1: test_leftPad_A1[index], left_A2: test_leftPad_A2[index], left_A3: test_leftPad_A3[index], left_A4: test_leftPad_A4[index], right_D: test_rightPad_D[index], right_D_s: test_rightPad_D_s[index], right_Q: test_rightPad_Q[index], right_A1: test_rightPad_A1[index], right_A2: test_rightPad_A2[index], right_A3: test_rightPad_A3[index], right_A4: test_rightPad_A4[index] }, on_unused_input='ignore') params = layer0_para + output_DNN1.params + output_DNN2.params + match_params + layer_d_para + [ attention_W ] # accumulator=[] # for para_i in params: # eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) # 
accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) # updates = [] # for param_i, grad_i, acc_i in zip(params, grads, accumulator): # grad_i=debug_print(grad_i,'grad_i') # acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop # updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6))) # updates.append((acc_i, acc)) def AdaDelta_updates(parameters, gradients, rho, eps): # create variables to store intermediate updates gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ] # calculates the new "average" delta for the next iteration gradients_sq_new = [ rho * g_sq + (1 - rho) * (g**2) for g_sq, g in zip(gradients_sq, gradients) ] # calculates the step in direction. The square root is an approximation to getting the RMS for the average value deltas = [ (T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad for d_sq, g_sq, grad in zip(deltas_sq, gradients_sq_new, gradients) ] # calculates the new "average" deltas for the next step. 
deltas_sq_new = [ rho * d_sq + (1 - rho) * (d**2) for d_sq, d in zip(deltas_sq, deltas) ] # Prepare it as a list f gradient_sq_updates = zip(gradients_sq, gradients_sq_new) deltas_sq_updates = zip(deltas_sq, deltas_sq_new) parameters_updates = [(p, p - d) for p, d in zip(parameters, deltas)] return gradient_sq_updates + deltas_sq_updates + parameters_updates updates = AdaDelta_updates(params, grads, decay, 1e-6) train_model = theano.function( [index], [cost, posi_simi, nega_simi], updates=updates, givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') train_model_predict = theano.function( [index], [cost, posi_simi, nega_simi], givens={ index_D: train_data_D[index], index_Q: train_data_Q[index], index_A1: train_data_A1[index], index_A2: train_data_A2[index], index_A3: train_data_A3[index], index_A4: train_data_A4[index], len_D: train_Length_D[index], len_D_s: train_Length_D_s[index], len_Q: train_Length_Q[index], len_A1: train_Length_A1[index], len_A2: train_Length_A2[index], len_A3: train_Length_A3[index], len_A4: train_Length_A4[index], left_D: train_leftPad_D[index], left_D_s: 
train_leftPad_D_s[index], left_Q: train_leftPad_Q[index], left_A1: train_leftPad_A1[index], left_A2: train_leftPad_A2[index], left_A3: train_leftPad_A3[index], left_A4: train_leftPad_A4[index], right_D: train_rightPad_D[index], right_D_s: train_rightPad_D_s[index], right_Q: train_rightPad_Q[index], right_A1: train_rightPad_A1[index], right_A2: train_rightPad_A2[index], right_A3: train_rightPad_A3[index], right_A4: train_rightPad_A4[index] }, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() mid_time = start_time epoch = 0 done_looping = False max_acc = 0.0 best_epoch = 0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index = 0 # shuffle(train_batch_start)#shuffle training data corr_train = 0 for batch_start in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + minibatch_index + 1 sys.stdout.write("Training :[%6f] %% complete!\r" % ((iter % train_size) * 100.0 / train_size)) sys.stdout.flush() minibatch_index = minibatch_index + 1 cost_average, posi_simi, nega_simi = train_model(batch_start) if posi_simi > nega_simi: corr_train += 1 if iter % n_train_batches == 0: print 'training @ iter = ' + str( iter) + ' average cost: ' + str( cost_average) + 'corr rate:' + str( corr_train * 100.0 / train_size) if iter % validation_frequency == 0: corr_test = 0 for i in test_batch_start: cost, posi_simi, nega_simi = test_model(i) if posi_simi > nega_simi: corr_test += 1 #write_file.close() #test_score = numpy.mean(test_losses) test_acc = corr_test * 1.0 / test_size #test_acc=1-test_score print( ('\t\t\tepoch %i, minibatch %i/%i, test acc of best ' 'model %f %%') % (epoch, minibatch_index, n_train_batches, test_acc * 100.)) #now, see the results of LR #write_feature=open(rootPath+'feature_check.txt', 'w') find_better = False if test_acc > max_acc: max_acc = test_acc best_epoch = epoch find_better = True print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')' if find_better == True: store_model_to_file(params, best_epoch, max_acc) print 'Finished storing best params' if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min' mid_time = time.clock() #writefile.close() #print 'Batch_size: ', update_freq end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at 
iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.5, n_epochs=2000, batch_size=500,
                    emb_size=300, hidden_size=300, L2_weight=0.0001,
                    para_len_limit=700, q_len_limit=40):
    """Train a bi-directional GRU attention tagger and score it periodically.

    The function loads the train and dev/test data, builds a Theano graph
    that encodes the paragraph and the question with
    ``Bd_GRU_Batch_Tensor_Input_with_Mask``, averages their tanh projections,
    appends the normalised extra features and feeds every paragraph position
    to a two-class ``LogisticRegression``.  The loss is the mean negative
    log-likelihood over the positions where ``para_mask`` is non-zero, and
    parameters are updated with AdaGrad.  Every 10 updates the whole test
    split is scored with ``MacroF1`` and the running best is reported.

    Args:
        learning_rate: AdaGrad step size.
        n_epochs: Maximum number of passes over the training batches.
        batch_size: Number of examples per batch.  The graph reshapes with
            this constant, so every batch fed to it must be full.
        emb_size: Width of the word embeddings.
        hidden_size: Hidden size handed to the GRU / attention parameters.
        L2_weight: Currently has no effect; the L2 term is not added to the
            cost (see the note next to ``cost``).
        para_len_limit: Forwarded to ``load_train`` / ``load_dev_or_test``.
        q_len_limit: Forwarded to ``load_train`` / ``load_dev_or_test``.

    Returns:
        None.  Progress and results are printed.

    Raises:
        SystemExit: with status 1 when the loaded lists disagree in length,
            when a split is smaller than ``batch_size``, or when the model
            output length does not match the paragraph length.
    """
    model_options = locals().copy()
    # Single pre-formatted argument: same text under the Python 2 print
    # statement and the Python 3 print function.
    print('model options %s' % (model_options,))
    rng = np.random.RandomState(23455)
    floatX = theano.config.floatX

    # ---- load and sanity-check the data ---------------------------------
    (train_para_list, train_Q_list, train_label_list, train_para_mask,
     train_mask, word2id, train_feature_matrixlist) = load_train(
         para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if (train_size != len(train_Q_list)
            or train_size != len(train_label_list)
            or train_size != len(train_para_mask)):
        print('train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)')
        # Non-zero status: this is a failure, not a clean shutdown.
        sys.exit(1)

    (test_para_list, test_Q_list, test_para_mask, test_mask,
     overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list,
     test_feature_matrixlist) = load_dev_or_test(
         word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if (test_size != len(test_Q_list)
            or test_size != len(test_mask)
            or test_size != len(test_para_mask)):
        print('test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)')
        sys.exit(1)

    # The tail batch starts at ``size - batch_size``; with fewer examples
    # than one batch that start would be negative and slice the wrong rows.
    if train_size < batch_size or test_size < batch_size:
        print('train_size and test_size must both be >= batch_size')
        sys.exit(1)

    # ---- embeddings -----------------------------------------------------
    id2word = {y: x for x, y in overall_word2id.items()}
    word2vec = load_word2vec()
    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      floatX, np.random.RandomState(1234))
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # ---- symbolic inputs ------------------------------------------------
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')
    # Embedding lookup, then move the embedding axis to the middle:
    # (batch, emb_size, paragraph length).
    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))

    # NOTE: the order of the create_*_para calls is kept as in the original
    # because they all draw from the same ``rng``.
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]
    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,
        U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b)
    # presumably (batch, hidden_size, para_len) -- it is transposed to put
    # the hidden axis last before the dot product below; TODO confirm.
    para_reps = paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))
    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]
    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size,
        U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    # One pooled vector per question, with a singleton axis so that it
    # broadcasts against every paragraph position when added below.
    questions_reps = questions_model.output_sent_rep_maxpooling.reshape(
        (batch_size, 1, hidden_size))

    # Attention parameters.
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features
    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)
    LR_b = theano.shared(
        value=np.zeros((2, ), dtype=floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)
    attention_paras = [W_a1, W_a2, U_a, LR_b]

    transformed_para_reps = T.tanh(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2))
    transformed_q_reps = T.tanh(T.dot(questions_reps, norm_W_a1))
    add_both = 0.5 * (transformed_para_reps + transformed_q_reps)
    prior_att = T.concatenate([add_both, normalize_matrix(extraF)], axis=2)

    # Flat indices of the real (unpadded) paragraph positions.
    valid_indices = para_mask.flatten().nonzero()[0]
    # Every (example, position) pair becomes one row of the classifier input.
    layer3 = LogisticRegression(
        rng,
        input=prior_att.reshape(
            (batch_size * prior_att.shape[1], hidden_size + 3)),
        n_in=hidden_size + 3, n_out=2, W=norm_U_a, b=LR_b)
    # Mean negative log-likelihood of the gold class, valid positions only.
    error = -T.mean(
        T.log(layer3.p_y_given_x)[valid_indices,
                                  labels.flatten()[valid_indices]])
    # Last column of p_y_given_x, reshaped back to one row per example and
    # zeroed on padding.
    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    masked_dis = distributions * para_mask

    params = [embeddings] + paragraph_para + Q_para + attention_paras
    # The L2 term is built but NOT added to the cost (it was commented out
    # in the original objective), so ``L2_weight`` currently has no effect.
    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    cost = error  # + L2_weight * L2_reg

    # AdaGrad: one accumulator of squared gradients per parameter.
    accumulator = [
        theano.shared(
            np.zeros_like(para_i.get_value(borrow=True), dtype=floatX),
            borrow=True)
        for para_i in params
    ]
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        error, updates=updates, on_unused_input='ignore')
    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    patience = 500000000000000  # effectively disables early stopping
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    def _batch_starts(size):
        # Full batches, plus one tail batch aligned to the end of the data
        # when ``size`` is not a multiple of ``batch_size``.  The tail
        # overlaps the previous batch so that every batch stays full.
        # ``//`` keeps the count an int on Python 3 as well.
        starts = list(range(0, (size // batch_size) * batch_size, batch_size))
        if size % batch_size:
            starts.append(size - batch_size)
        return starts

    def _batch(seq, start, dtype):
        # Slice one batch and convert it to the dtype the graph expects.
        return np.asarray(seq[start:start + batch_size], dtype=dtype)

    train_batch_start = _batch_starts(train_size)
    test_batch_start = _batch_starts(test_size)

    max_f1 = 0.0
    best_iter = 0
    cost_i = 0.0
    # Number of parameter updates performed so far.  A running counter keeps
    # ``cost_i / iteration`` an actual mean (the per-epoch formula used
    # before repeated a number at every epoch boundary).
    iteration = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for para_id in train_batch_start:
            iteration += 1
            cost_i += train_model(
                _batch(train_para_list, para_id, 'int32'),
                _batch(train_Q_list, para_id, 'int32'),
                _batch(train_label_list, para_id, 'int32'),
                _batch(train_para_mask, para_id, floatX),
                _batch(train_mask, para_id, floatX),
                _batch(train_feature_matrixlist, para_id, floatX))

            if iteration % 10 == 0:
                print('Epoch  %s iter %s average cost: %s uses  %s min'
                      % (epoch, iteration, cost_i / iteration,
                         (time.time() - past_time) / 60.0))
                print('Testing...')
                past_time = time.time()
                f1_sum = 0.0
                q_amount = 0  # number of distinct test questions scored
                for test_para_id in test_batch_start:
                    stop = test_para_id + batch_size
                    distribution_matrix = test_model(
                        _batch(test_para_list, test_para_id, 'int32'),
                        _batch(test_Q_list, test_para_id, 'int32'),
                        _batch(test_para_mask, test_para_id, floatX),
                        _batch(test_mask, test_para_id, floatX),
                        _batch(test_feature_matrixlist, test_para_id, floatX))
                    test_para_wordlist_list = test_text_list[test_para_id:stop]
                    para_gold_ansset_list = q_ansSet_list[test_para_id:stop]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:stop]
                    sub_para_mask = test_para_mask[test_para_id:stop]

                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print('para_len!=len(distribution_matrix[0]): %s %s'
                              % (para_len, len(distribution_matrix[0])))
                        sys.exit(1)

                    # The tail batch overlaps the previous one; skip the
                    # questions that were already scored so each test
                    # question contributes to the average exactly once.
                    first_new = max(0, q_amount - test_para_id)
                    for q in range(first_new, batch_size):
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q],
                            distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=floatX),
                            sub_para_mask[q])
                        f1_sum += MacroF1(pred_ans, para_gold_ansset_list[q])
                    q_amount += batch_size - first_new

                avg_f1 = f1_sum / q_amount
                if avg_f1 > max_f1:
                    max_f1 = avg_f1
                    best_iter = iteration
                print('current average F1: %s \t\tmax F1: %s'
                      % (avg_f1, max_f1))

            if patience <= iteration:
                done_looping = True
                break

        print('Epoch  %s uses  %s min'
              % (epoch, (time.time() - mid_time) / 60.0))
        mid_time = time.time()

    end_time = time.time()
    print('Optimization complete.')
    # Report the values that are actually tracked during training.
    print('Best average F1 of %f obtained at iteration %i'
          % (max_f1, best_iter))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def evaluate_lenet5(learning_rate=0.005, n_epochs=2000, batch_size=300, test_batch_size=400, emb_size=50, hidden_size=300, HL_hidden_size=200, L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=400, trichar_len=15,char_emb_size=50, para_len=101, question_len=20, c_len=1, model_type='train'): model_options = locals().copy() print "model options", model_options rootPath='/mounts/Users/cisintern/hs/l/workhs/yin/20170320/'; storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = np.random.RandomState(23455) word2id={} trichar2id={} word2id['UNK']=0 # use it to pad #word2id, trichar2id, questions,questions_mask,paras,paras_mask,labels, isInQ_para, paras_shape, questions_shape, types, types_shape,question_trichar_ids,question_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks word2id, trichar2id,train_questions,train_questions_mask,train_paras,train_paras_mask,train_labels, train_islabels, train_paras_shape, train_questions_shape, train_types, train_types_shape,train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks=load_SQUAD_hinrich_v4(train_size, para_len, question_len, trichar_len, word2id,trichar2id, rootPath+'trn20170320.txt') word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev.big.20170320.txt') word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, 
test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev20170320.txt') print 'word2id size for bigger dataset:', len(word2id), 'trichar size:', len(trichar2id) train_size=len(train_questions) test_size = len(test_questions) #50010# train_questions = np.asarray(train_questions, dtype='int32') train_questions_shape = np.asarray(train_questions_shape, dtype='int32') train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX) train_paras = np.asarray(train_paras, dtype='int32') train_paras_shape = np.asarray(train_paras_shape, dtype='int32') train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX) train_types = np.asarray(train_types, dtype='int32') train_types_shape = np.asarray(train_types_shape, dtype='int32') # train_c_ids = np.asarray(train_c_ids, dtype='int32') # train_c_ids_shape = np.asarray(train_c_ids_shape, dtype='int32') # train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX) train_islabels = np.asarray(train_islabels, dtype=theano.config.floatX) # train_c_heads = np.asarray(train_c_heads, dtype='int32') # train_c_tails = np.asarray(train_c_tails, dtype='int32') train_labels = np.asarray(train_labels, dtype='int32') #train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks train_question_trichar_ids = np.asarray(train_question_trichar_ids, dtype='int32') train_question_trichar_masks = np.asarray(train_question_trichar_masks, dtype=theano.config.floatX) train_para_trichar_ids = np.asarray(train_para_trichar_ids, dtype='int32') train_para_trichar_masks = np.asarray(train_para_trichar_masks, dtype=theano.config.floatX) train_type_trichar_ids = np.asarray(train_type_trichar_ids, dtype='int32') 
train_type_trichar_masks = np.asarray(train_type_trichar_masks, dtype=theano.config.floatX) test_questions = np.asarray(test_questions, dtype='int32') test_questions_shape = np.asarray(test_questions_shape, dtype='int32') test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX) test_paras = np.asarray(test_paras, dtype='int32') test_paras_shape = np.asarray(test_paras_shape, dtype='int32') test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX) test_types = np.asarray(test_types, dtype='int32') test_types_shape = np.asarray(test_types_shape, dtype='int32') # test_c_ids = np.asarray(test_c_ids, dtype='int32') # test_c_ids_shape = np.asarray(test_c_ids_shape, dtype='int32') # test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX) test_islabels = np.asarray(test_islabels, dtype=theano.config.floatX) # test_c_heads = np.asarray(test_c_heads, dtype='int32') # test_c_tails = np.asarray(test_c_tails, dtype='int32') test_labels = np.asarray(test_labels, dtype='int32') test_question_trichar_ids = np.asarray(test_question_trichar_ids, dtype='int32') test_question_trichar_masks = np.asarray(test_question_trichar_masks, dtype=theano.config.floatX) test_para_trichar_ids = np.asarray(test_para_trichar_ids, dtype='int32') test_para_trichar_masks = np.asarray(test_para_trichar_masks, dtype=theano.config.floatX) test_type_trichar_ids = np.asarray(test_type_trichar_ids, dtype='int32') test_type_trichar_masks = np.asarray(test_type_trichar_masks, dtype=theano.config.floatX) overall_vocab_size=len(word2id) print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size, emb_size), theano.config.floatX, rng) rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) 
embeddings=theano.shared(value=rand_values, borrow=True) overall_trichar_size = len(trichar2id) char_rand_values=random_value_normal((overall_trichar_size, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) para=T.imatrix() #(2*batch, len) para_shape = T.imatrix() para_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_shape = T.imatrix() q_mask=T.fmatrix() #(2*batch, len_q) islabels = T.fmatrix() labels=T.ivector() #batch types=T.imatrix() types_shape=T.imatrix() q_trichar_ids = T.imatrix() q_trichar_masks =T.fmatrix() para_trichar_ids = T.imatrix() para_trichar_masks =T.fmatrix() type_trichar_ids = T.imatrix() type_trichar_masks =T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' true_batch_size = para.shape[0] paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) q_types = embeddings[types.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1)) paragraph_input_shape = embeddings[para_shape.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len) q_input_shape = embeddings[q_shape.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len) q_types_shape = embeddings[types_shape.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1)) paragraph_input_trichar = char_embeddings[para_trichar_ids.flatten()].reshape((true_batch_size, para_len*trichar_len, char_emb_size)) #(batch, char_emb_size, para_len*trichar_len) q_input_trichar = char_embeddings[q_trichar_ids.flatten()].reshape((true_batch_size, question_len*trichar_len, char_emb_size)) # (batch, emb_size, question_len) 
q_types_trichar = char_embeddings[type_trichar_ids.flatten()].reshape((true_batch_size, 2*trichar_len, char_emb_size)) #sum up trichar emb as word level embs paragraph_input_trichar=T.sum((paragraph_input_trichar*para_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, para_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,para_len) q_input_trichar=T.sum((q_input_trichar*q_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, question_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,q_len) q_types_trichar=T.sum((q_types_trichar*type_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, 2, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,2) #concatenate word emb with shape emb q_input = T.concatenate([q_input,q_input_shape, q_input_trichar],axis=1) #(batch, 2*emb_size+char_emb_size, q_len) paragraph_input = T.concatenate([paragraph_input,paragraph_input_shape, paragraph_input_trichar,islabels.dimshuffle(0,'x',1)],axis=1)#(batch, 2*emb_size+char_emb_size+1, para_len) q_types_input = T.sum(T.concatenate([q_types,q_types_shape,q_types_trichar],axis=1), axis=2) #(batch, 2*emb+char_emb_size) fwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size) bwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size) paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen) fwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size) bwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size) question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of 
parameters questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden) #interaction batch_ids=T.arange(true_batch_size) # c_heads=theano.shared(value=np.asarray([(para_len-1)/2]*batch_size, dtype='int32'), borrow=True) c_heads = T.repeat(theano.shared(value=np.asarray([(para_len-1)/2], dtype='int32'), borrow=True), true_batch_size) c_tails=c_heads+1 c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden) c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden) candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden) context_l=paragraph_model.forward_output[batch_ids,:,c_heads-1] #(batch, hidden) context_r=paragraph_model.backward_output[batch_ids,:,c_tails+1]#(batch, hidden) #glove level average # c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) # c_input_shape = embeddings[c_ids_shape.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) # c_input = T.concatenate([c_input,c_input_shape],axis=1) c_sum = paragraph_input[:,:-1,(para_len-1)/2]#(batch, 2*emb_size+char_emb) c_sum_with_isInQLabel = paragraph_input[:,:,(para_len-1)/2] # e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len) q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, 2*emb_size+char_emb_size) # average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x') HL_layer_1_input_size=2*hidden_size+4*hidden_size+(2*emb_size+char_emb_size+1)+(2*emb_size+char_emb_size)+1+hidden_size+hidden_size+(2*emb_size+char_emb_size)+1 cosine_Qtype_cand = cosine_row_wise_twoMatrix(q_types_input, c_sum).dimshuffle(0,'x') #(batch, 1) #, average_E_batch, average_C_batch, average_Q_batch 
HL_layer_1_input = T.concatenate([q_reps, candididates_reps, c_sum_with_isInQLabel, q_sum, islabels[:,(para_len-1)/2:(para_len-1)/2+1], context_l, context_r, q_types_input, cosine_Qtype_cand], axis=1) HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh) LR_input= T.concatenate([HL_layer_1.output, HL_layer_2.output, islabels[:,(para_len-1)/2:(para_len-1)/2+1], cosine_Qtype_cand], axis=1) #(batch, char_HL_hidden_size+HL_hidden_size) LR_input_size= HL_hidden_size+HL_hidden_size+1+1#HL_layer_1_input_size+2*HL_hidden_size U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 norm_U_a=normalize_matrix(U_a) LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='char_LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = LR_para+[embeddings,char_embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params # load_model_from_file(storePath+'Best_Paras_HS_20170316_0.760357142857', params) # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) # L2_reg = L2norm_paraList(params) cost=loss#+1e-6*L2_reg accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], cost, updates=updates,on_unused_input='ignore') # train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore') test_model = theano.function([para, para_shape, para_mask, q,q_shape, q_mask,islabels, labels, types, types_shape,q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], [layer_LR.errors(labels),layer_LR.prop_for_posi], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] # n_train_batches_pred=train_size/batch_size_pred #batch_size means how many pairs # train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] n_test_batches=test_size/test_batch_size #batch_size means how many pairs n_test_remain=test_size%test_batch_size #batch_size means how many pairs test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_acc=0.0 cost_i=0.0 train_ids = range(train_size) # train_ids_pred = range(train_size) best_test_statistic=defaultdict(int) # best_train_statistic=defaultdict(int) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) # print train_ids[:100] iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list = train_ids[para_id:para_id+batch_size] # print 'train_labels[train_id_list]:', train_labels[train_id_list] if model_type=='train': #para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks cost_i+= train_model( train_paras[train_id_list], train_paras_shape[train_id_list], train_paras_mask[train_id_list], train_questions[train_id_list], train_questions_shape[train_id_list], train_questions_mask[train_id_list], train_islabels[train_id_list], train_labels[train_id_list], train_types[train_id_list], train_types_shape[train_id_list], train_question_trichar_ids[train_id_list], train_question_trichar_masks[train_id_list], 
train_para_trichar_ids[train_id_list], train_para_trichar_masks[train_id_list], train_type_trichar_ids[train_id_list], train_type_trichar_masks[train_id_list]) #print iter if iter%10 ==0: print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() print 'Testing...' error=0 test_statistic=defaultdict(int) if model_type=='test': writefile=open(storePath+'predictions_20170317.txt', 'w') for id, test_para_id in enumerate(test_batch_start): test_id_list = range(test_para_id, test_para_id+test_batch_size) # print 'test_id_list:',test_id_list # print 'test_c_heads[test_id_list]', test_c_heads[test_id_list] # gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size] error_i, preds_i= test_model( test_paras[test_id_list], test_paras_shape[test_id_list], test_paras_mask[test_id_list], test_questions[test_id_list], test_questions_shape[test_id_list], test_questions_mask[test_id_list], test_islabels[test_id_list], test_labels[test_id_list], test_types[test_id_list], test_types_shape[test_id_list], test_question_trichar_ids[test_id_list], test_question_trichar_masks[test_id_list], test_para_trichar_ids[test_id_list], test_para_trichar_masks[test_id_list], test_type_trichar_ids[test_id_list], test_type_trichar_masks[test_id_list]) if model_type=='test': if id < len(test_batch_start)-1: writefile.write('\n'.join(map(str,list(preds_i)))+'\n') else: writefile.write('\n'.join(map(str,list(preds_i)[-n_test_remain:]))+'\n') error+=error_i # for ind, gold_label in enumerate(gold_labels_list): # test_statistic[(gold_label, preds_i[ind])]+=1 if model_type=='test': writefile.close() acc=1.0-error*1.0/len(test_batch_start) # acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0)) if acc> max_acc: max_acc=acc # 
best_test_statistic=test_statistic if model_type=='train': store_model_to_file(storePath+'Best_Paras_HS_20170324_'+str(max_acc), params) print 'Finished storing best params at:', max_acc print 'current average acc:', acc, '\t\tmax acc:', max_acc#, '\ttest_statistic:', test_statistic # print '\t\t\t\tbest statistic:', best_test_statistic if model_type=='test': exit(0) if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546): model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train( para_len_limit, q_len_limit) train_size = len(train_para_list) if train_size != len(train_Q_list) or train_size != len( train_label_list) or train_size != len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test( word2id, para_len_limit, q_len_limit) test_size = len(test_para_list) if test_size != len(test_Q_list) or test_size != len( test_mask) or test_size != len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values = random_value_normal((overall_vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extraF = T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print 
'... building the model' norm_extraF = normalize_matrix(extraF) U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size) paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b] UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size) Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] W_a1 = create_ensemble_para( rng, hidden_size, hidden_size) # init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size + 3) # 3 extra features LR_b = theano.shared( value=numpy.zeros((2, ), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras = [W_a1, W_a2, U_a, LR_b] params = [embeddings] + paragraph_para + Q_para + attention_paras load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params) paragraph_input = embeddings[paragraph.flatten()].reshape( (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose( (0, 2, 1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input = T.concatenate( [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1) paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size, U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b) para_reps = paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape( (questions.shape[0], questions.shape[1], emb_size)).transpose( (0, 2, 1)) #(#questions, 
emb_size, maxsenlength) questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask( X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size) questions_reps_tensor = questions_model.output_tensor #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor #use CNN for question modeling # Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen) # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5)) # Q_conv_para=[conv_W, conv_b] # conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4, # image_shape=(batch_size, 1, emb_size, q_len_limit), # filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b) # conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1) # gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1)) # masked_conv_output=conv_output*gru_mask # questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size)) # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX) 
# ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix = para_matrix.T interaction_matrix = T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix = T.nnet.softmax(interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T) #(len, para_len) batch_q_reps, updates = theano.scan( fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor ]) #batch_q_reps (batch, hidden, para_len) #attention distributions norm_W_a1 = normalize_matrix(W_a1) norm_W_a2 = normalize_matrix(W_a2) norm_U_a = normalize_matrix(U_a) transformed_para_reps = T.maximum( T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0) #relu transformed_q_reps = T.maximum( T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both = transformed_para_reps + transformed_q_reps # U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size) # U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size) # accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] # accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b) # accu_both=accumu_model.output_tensor.transpose((0,2,1)) prior_att = T.concatenate([add_both, norm_extraF], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices = para_mask.flatten().nonzero()[0] layer3 = LogisticRegression(rng, input=prior_att.reshape( (batch_size * prior_att.shape[1], hidden_size + 3)), n_in=hidden_size + 3, n_out=2, W=norm_U_a, b=LR_b) #error 
=layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = -T.sum( T.log(layer3.p_y_given_x) [valid_indices, labels.flatten()[valid_indices]]) #[T.arange(y.shape[0]), y]) distributions = layer3.p_y_given_x[:, -1].reshape( (batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) # masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask masked_dis = distributions * para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* ''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # f __name__ == '__main__': # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # 
remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] L2_reg = L2norm_paraList( [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost = error #+ConvGRU_1.error# accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function( [paragraph, questions, labels, para_mask, q_mask, extraF], cost, updates=updates, on_unused_input='ignore') test_model = theano.function( [paragraph, questions, para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / batch_size # remain_test=test_size%batch_size test_batch_start = list( numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i += train_model( np.asarray([ train_para_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_Q_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_label_list[id] for id in train_ids[para_id:para_id + batch_size] ], dtype='int32'), np.asarray([ train_para_mask[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX), np.asarray([ train_mask[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX), np.asarray([ train_feature_matrixlist[id] for id in train_ids[para_id:para_id + batch_size] ], dtype=theano.config.floatX)) #print iter if iter % 10 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() exact_match = 0.0 F1_match = 0.0 q_amount = 0 for test_para_id in test_batch_start: distribution_matrix = test_model( np.asarray(test_para_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id + batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX), np.asarray( test_feature_matrixlist[test_para_id:test_para_id + batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list = test_text_list[ test_para_id:test_para_id + batch_size] para_gold_ansset_list = q_ansSet_list[ test_para_id:test_para_id + batch_size] paralist_extra_features = test_feature_matrixlist[ test_para_id:test_para_id + batch_size] sub_para_mask = test_para_mask[test_para_id:test_para_id + batch_size] para_len = len(test_para_wordlist_list[0]) if para_len != len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len( distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount += batch_size # print q_size # print test_para_word_list Q_list_inword = test_Q_list_word[ test_para_id:test_para_id + batch_size] for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans = extract_ansList_attentionList( test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) 
q_gold_ans_set = para_gold_ansset_list[q] # print test_para_wordlist_list[q] # print Q_list_inword[q] # print pred_ans.encode('utf8'), q_gold_ans_set if pred_ans in q_gold_ans_set: exact_match += 1 F1 = MacroF1(pred_ans, q_gold_ans_set) F1_match += F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc = F1_match / q_amount exact_acc = exact_match / q_amount if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc if max_exact_acc > max_EM: store_model_to_file( rootPath + 'Best_Paras_conv_' + str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=10, filter_size=[3, 5], maxSentLen=40, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) dev_sents = np.asarray(all_sentences[1], dtype='int32') dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels = np.asarray(all_labels[1], dtype='int32') dev_size = len(dev_labels) test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) NN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size LR_input = T.concatenate([sent_embeddings, sent_embeddings2], axis=1) LR_input_size = hidden_size[0] * 2 #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 
prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) # loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [embeddings ] + NN_para + LR_para # put all model parameters together cost = loss #+Div_reg*diversify_reg#+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' binarize_prob = T.where(score_matrix > 0.5, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 cost_i = 0.0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size]) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if 
test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'): hidden_size=emb_size model_options = locals().copy() print "model options", model_options rng = np.random.RandomState(1234) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,all_labels, word2id =load_SNLI_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_sents_l=np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') test_sents_l=np.asarray(all_sentences_l[2], dtype='int32') train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r=np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32') train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store=np.asarray(all_labels[0], dtype='int32') dev_labels_store=np.asarray(all_labels[1], dtype='int32') test_labels_store=np.asarray(all_labels[2], dtype='int32') train_size=len(train_labels_store) dev_size=len(dev_labels_store) test_size=len(test_labels_store) vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) 
embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' common_input_l=embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_r=embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)) #conv if nn=='CNN': conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size)) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) NN_para=[conv_W, conv_b] conv_input_l = common_input_l.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen) conv_model_l = Conv_with_input_para(rng, input=conv_input_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b) conv_output_l=conv_model_l.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1) conv_output_into_tensor3_l=conv_output_l.reshape((batch_size, hidden_size, maxSentLen-filter_size+1)) mask_for_conv_output_l=T.repeat(sents_mask_l[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, emb_size, maxSentLen-filter_size+1) mask_for_conv_output_l=(1.0-mask_for_conv_output_l)*(mask_for_conv_output_l-10) masked_conv_output_l=conv_output_into_tensor3_l+mask_for_conv_output_l #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings_l=T.max(masked_conv_output_l, axis=2) #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_input_r = common_input_r.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen) 
conv_model_r = Conv_with_input_para(rng, input=conv_input_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b) conv_output_r=conv_model_r.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1) conv_output_into_tensor3_r=conv_output_r.reshape((batch_size, hidden_size, maxSentLen-filter_size+1)) mask_for_conv_output_r=T.repeat(sents_mask_r[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, emb_size, maxSentLen-filter_size+1) mask_for_conv_output_r=(1.0-mask_for_conv_output_r)*(mask_for_conv_output_r-10) masked_conv_output_r=conv_output_into_tensor3_r+mask_for_conv_output_r #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings_r=T.max(masked_conv_output_r, axis=2) #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size #GRU if nn=='GRU': U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) NN_para=[U1, W1, b1] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias gru_input_l = common_input_l.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer_l=GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l, hidden_size, U1, W1, b1) sent_embeddings_l=gru_layer_l.output_sent_rep # (batch_size, hidden_size) gru_input_r = common_input_r.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer_r=GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r, hidden_size, U1, W1, b1) sent_embeddings_r=gru_layer_r.output_sent_rep # (batch_size, hidden_size) #LSTM if nn=='LSTM': LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) NN_para=LSTM_para_dict.values() # .values returns a list of parameters lstm_input_l = common_input_l.dimshuffle((0,2,1)) #LSTM has the same inpur format with GRU lstm_layer_l=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l, hidden_size, LSTM_para_dict) 
sent_embeddings_l=lstm_layer_l.output_sent_rep # (batch_size, hidden_size) lstm_input_r = common_input_r.dimshuffle((0,2,1)) #LSTM has the same inpur format with GRU lstm_layer_r=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r, hidden_size, LSTM_para_dict) sent_embeddings_r=lstm_layer_r.output_sent_rep # (batch_size, hidden_size) HL_layer_1_input = T.concatenate([sent_embeddings_l,sent_embeddings_r, sent_embeddings_l*sent_embeddings_r, cosine_matrix1_matrix2_rowwise(sent_embeddings_l,sent_embeddings_r).dimshuffle(0,'x')],axis=1) HL_layer_1_input_size = hidden_size*3+1 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh) #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative LR_input_size=HL_layer_1_input_size+2*hidden_size U_a = create_ensemble_para(rng, 3, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] LR_input=T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1) layer_LR=LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
params = [embeddings]+NN_para+LR_para+HL_layer_1.params+HL_layer_2.params # put all model parameters together # L2_reg =L2norm_paraList([embeddings,conv_W, U_a]) # diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix) cost=loss#+Div_reg*diversify_reg#+L2_weight*L2_reg grads = T.grad(cost, params) # create a list of gradients for all model parameters accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #1e-8 is add to get rid of zero division updates.append((acc_i, acc)) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_train_batches=train_size/batch_size train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_dev_batches=dev_size/batch_size dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches=test_size/batch_size test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_acc_dev=0.0 max_acc_test=0.0 while epoch < n_epochs: epoch = epoch + 1 train_indices = range(train_size) random.Random(200).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu=0 cost_i=0.0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_indices[batch_id:batch_id+batch_size] cost_i+= train_model( train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter%500==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' past_time = time.time() # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0: # print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' # past_time = time.time() error_sum=0.0 for dev_batch_id in dev_batch_start: # for each test batch error_i=dev_model( dev_sents_l[dev_batch_id:dev_batch_id+batch_size], dev_masks_l[dev_batch_id:dev_batch_id+batch_size], dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 
dev_masks_r[dev_batch_id:dev_batch_id+batch_size], dev_labels_store[dev_batch_id:dev_batch_id+batch_size] ) error_sum+=error_i dev_accuracy=1.0-error_sum/(len(dev_batch_start)) if dev_accuracy > max_acc_dev: max_acc_dev=dev_accuracy print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev #best dev model, do test error_sum=0.0 for test_batch_id in test_batch_start: # for each test batch error_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_labels_store[test_batch_id:test_batch_id+batch_size] ) error_sum+=error_i test_accuracy=1.0-error_sum/(len(test_batch_start)) if test_accuracy > max_acc_test: max_acc_test=test_accuracy print '\t\tcurrent testbacc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test else: print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=300, hidden_size=300, L2_weight=0.0001, margin=0.5, train_size=4000000, test_size=1000, max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.052): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = np.random.RandomState(23455) word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len) test_ground_truth,test_candidates,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len) overall_vocab_size=len(word2id) print 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234)) # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() left=T.imatrix() #(2*batch, len) left_mask=T.fmatrix() #(2*batch, len) span=T.imatrix() #(2*batch, span_len) span_mask=T.fmatrix() #(2*batch, span_len) right=T.imatrix() #(2*batch, len) right_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_mask=T.fmatrix() #(2*batch, len_q) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b] U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size) U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size) GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b] W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) attend_para=[W_a1, W_a2] params = [embeddings]+GRU1_para+attend_para+GRU2_para # load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params) left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span) right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q) left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) left_reps=left_model.output_tensor #(batch, emb, para_len) span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) span_reps=span_model.output_tensor #(batch, emb, para_len) right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) right_reps=right_model.output_tensor #(batch, emb, para_len) q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) q_reps=q_model.output_tensor 
#(batch, emb, para_len) #interaction left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps) span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps) right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps) # q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps) # q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps) # q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps) #combine origin_W=normalize_matrix(W_a1) attend_W=normalize_matrix(W_a2) left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W) span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W) right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W) q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W) left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W) span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W) right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W) q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) add_left=left_origin_reps+q_attend_left_reps #(2*batch, len ,hidden) add_span=span_origin_reps+q_attend_span_reps add_right=right_origin_reps+q_attend_right_reps add_q_by_left=q_origin_reps+left_attend_q_reps add_q_by_span=q_origin_reps+span_attend_q_reps add_q_by_right=q_origin_reps+right_attend_q_reps #second GRU add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), 
Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden) question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, add_q_by_right_reps], axis=1) #(batch, 3*hidden) simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat) #(2*batch) pos_simi_vec=simi_list[::2] neg_simi_vec=simi_list[1::2] raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec) #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=T.sum(raw_loss)#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) 
# create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs remain_train=train_size%batch_size # train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always ou shu if remain_train>0: train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] else: train_batch_start=list(np.arange(n_train_batches)*batch_size) max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_odd_ids = list(np.arange(train_size)*2) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_odd_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]] train_id_list=sum(train_id_list,[]) # print train_id_list cost_i+= train_model( np.asarray([train_lefts[id] for id in train_id_list], 
dtype='int32'), np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_spans[id] for id in train_id_list], dtype='int32'), np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_rights[id] for id in train_id_list], dtype='int32'), np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_questions[id] for id in train_id_list], dtype='int32'), np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX)) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() exact_match=0.0 F1_match=0.0 for test_pair_id in range(test_size): test_example_lefts=test_lefts[test_pair_id] test_example_lefts_mask=test_lefts_mask[test_pair_id] test_example_spans=test_spans[test_pair_id] test_example_spans_mask=test_spans_mask[test_pair_id] test_example_rights=test_rights[test_pair_id] test_example_rights_mask=test_rights_mask[test_pair_id] test_example_questions=test_questions[test_pair_id] test_example_questions_mask=test_questions_mask[test_pair_id] test_example_candidates=test_candidates[test_pair_id] test_example_size=len(test_example_lefts) # print 'test_pair_id, test_example_size:', test_pair_id, test_example_size if test_example_size < test_batch_size: #pad pad_size=test_batch_size-test_example_size test_example_lefts+=test_example_lefts[-1:]*pad_size test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size test_example_spans+=test_example_spans[-1:]*pad_size test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size test_example_rights+=test_example_rights[-1:]*pad_size test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size test_example_questions+=test_example_questions[-1:]*pad_size 
test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size test_example_candidates+=test_example_candidates[-1:]*pad_size test_example_size=test_batch_size n_test_batches=test_example_size/test_batch_size n_test_remain=test_example_size%test_batch_size if n_test_remain > 0: test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size] else: test_batch_start=list(np.arange(n_test_batches)*test_batch_size) all_simi_list=[] all_cand_list=[] for test_para_id in test_batch_start: simi_return_vector=test_model( np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) candidate_list=test_example_candidates[test_para_id:test_para_id+test_batch_size] all_simi_list+=list(simi_return_vector) all_cand_list+=candidate_list top1_cand=all_cand_list[np.argsort(all_simi_list)[-1]] # print top1_cand, test_ground_truth[test_pair_id] if top1_cand == test_ground_truth[test_pair_id]: exact_match+=1 F1=macrof1(top1_cand, test_ground_truth[test_pair_id]) # print '\t\t\t', F1 F1_match+=F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) 
F1_acc=F1_match/test_size exact_acc=exact_match/test_size if F1_acc> max_F1_acc: max_F1_acc=F1_acc # store_model_to_file(params, emb_size) if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Para_'+str(max_EM), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=10, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size=[5, 5, 5, 5, 5], char_filter_size=5, margin=0.85, extra_size=5 + 11, extra_emb=10, distance=10, distance_emb=10, comment='add distance embs'): #extra_size=3+46+7 test_batch_size = batch_size model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) word2id = {} char2id = {} train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras, word2id, char2id = load_squad_cnn_rank_span_word_train( word2id, char2id, p_len_limit, q_len_limit, char_len) test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_span_word_dev( word2id, char2id, test_p_len_limit, q_len_limit, char_len) ''' #store variables into file ''' # train_variables = [train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras] # test_variables =[test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list] # with open(rootPath+'extra.3.pickle', 'wb') as f: # Python 3: open(..., 'wb') # cPickle.dump(train_variables+test_variables, f, protocol=cPickle.HIGHEST_PROTOCOL) # f.close() # print 'variable stored successfully' # exit(0) ''' load variables from file ''' # before_load_time = time.time() # 
with open(rootPath+'extra.3.pickle', 'rb') as f: # Python 3: open(..., 'rb') # train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras,test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = cPickle.load(f) # f.close() # print 'load data variables successfully, spend: ', (time.time()-before_load_time)/60.0, ' mins' train_size = len(train_para_list) test_size = len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_para_extras = numpy.asarray(train_para_extras, dtype=theano.config.floatX) train_span_label_list = numpy.asarray(train_span_label_list, dtype='int32') train_word_label_list = numpy.asarray(train_word_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = 
numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) test_para_extras = numpy.asarray(test_para_extras, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, rng) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_glove() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, rng) char_embeddings = theano.shared(value=char_rand_values, borrow=True) extra_rand_values = random_value_normal((extra_size, extra_emb), theano.config.floatX, rng) extra_embeddings = theano.shared(value=extra_rand_values, borrow=True) distance_rand_values = random_value_normal( (2 * distance + 1, distance_emb), theano.config.floatX, rng) distance_embeddings = theano.shared(value=distance_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') span_indices = T.ivector() #batch word_indices = T.imatrix() #(batch, 2) ans_indices = T.ivector() # for one batch, the length is dynamic para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extra = T.ftensor3() #(batch, p_len, 3) char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask = T.fmatrix() char_q_mask = T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] extra_rep_batch = T.concatenate( [extra.dot(extra_embeddings), extra], axis=2) #(batch, p_len, extra_emb+extra_size) zero_pad = T.zeros((true_batch_size, 1, extra_emb + extra_size)) left_context = T.concatenate([zero_pad, extra_rep_batch[:, :-1, :]], axis=1) #(batch, p_len, extra_emb+extra_size) right_context = T.concatenate( [extra_rep_batch[:, 1:, :], zero_pad], axis=1) #(batch, p_len, extra_emb+extra_size) left_context_2 = T.concatenate( [zero_pad, zero_pad, extra_rep_batch[:, :-2, :]], axis=1) #(batch, p_len, extra_emb+extra_size) right_context_2 = T.concatenate( [extra_rep_batch[:, 2:, :], zero_pad, zero_pad], axis=1) #(batch, p_len, extra_emb+extra_size) simi2left = T.sum(extra_rep_batch * left_context, axis=2).dimshuffle(0, 1, 'x') #(batch, p_len, 1) simi2right = T.sum(extra_rep_batch * right_context, axis=2).dimshuffle(0, 1, 'x') #(batch, p_len, 1) cos2left = cosine_tensor3(extra_rep_batch, left_context, 2).dimshuffle(0, 1, 'x') cos2right = cosine_tensor3(extra_rep_batch, right_context, 2).dimshuffle(0, 1, 'x') diff2left = extra_rep_batch - left_context diff2right = extra_rep_batch - right_context #(batch, p_len, extra_emb+extra_size) extra_rep_batch = T.concatenate( [ extra_rep_batch, left_context, right_context, left_context_2, right_context_2, diff2left, diff2right, simi2left, simi2right, cos2left, cos2right ], axis=2) #batch, p_len, 7*(extra_emb+extra_size)+4) true_extra_size = 7 * (extra_emb + extra_size) + 4 common_input_p = embeddings[paragraph.flatten()].reshape( (true_batch_size, true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q = embeddings[questions.flatten()].reshape( (true_batch_size, q_len_limit, emb_size)) char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape( (true_batch_size * true_p_len, char_len, char_emb_size )) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q = 
char_embeddings[char_questions.flatten()].reshape( (true_batch_size * q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape( (true_batch_size * true_p_len, char_len)) char_q_masks = char_q_mask.reshape( (true_batch_size * q_len_limit, char_len)) conv_W_char, conv_b_char = create_conv_para( rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1 = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size + true_extra_size, filter_size[0])) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_3, conv_b_3 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2])) # conv_W_4, conv_b_4=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3])) # conv_W_5, conv_b_5=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4])) conv_W_1_q, conv_b_1_q = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_3_q, conv_b_3_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2])) # conv_W_4_q, conv_b_4_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3])) # conv_W_5_q, conv_b_5_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4])) CNN_para = [ conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q, # conv_W_4, conv_b_4, conv_W_5, conv_b_5,conv_W_4_q, conv_b_4_q, conv_W_5_q, conv_b_5_q, conv_W_char, conv_b_char ] span_input4score, word_input4score, overall_span_hidden_size, overall_word_hidden_size = squad_cnn_rank_spans_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size, char_emb_size, 
char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q, # conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q, # conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q, para_mask, q_mask, char_p_masks, char_q_masks, extra_rep_batch, true_extra_size) test_span_input4score, test_word_input4score, _, _ = squad_cnn_rank_spans_word( rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q, # conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q, # conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q, para_mask, q_mask, char_p_masks, char_q_masks, extra_rep_batch, true_extra_size) #(batch, hidden, gram_size) gram_size = 5 * true_p_len - (0 + 1 + 2 + 3 + 4) # U_a = create_ensemble_para(rng, 1, 4*hidden_size) # norm_U_a=normalize_matrix(U_a) # span_scores_matrix=T.dot(span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((batch_size, gram_size)) #(batch, 13*para_len-78, 1) span_HL_1_para = create_ensemble_para(rng, hidden_size, overall_span_hidden_size) span_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) span_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) span_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) span_U_a = create_ensemble_para(rng, 1, hidden_size + overall_span_hidden_size) norm_span_U_a = normalize_matrix(span_U_a) norm_span_HL_1_para = normalize_matrix(span_HL_1_para) norm_span_HL_2_para = normalize_matrix(span_HL_2_para) norm_span_HL_3_para = normalize_matrix(span_HL_3_para) norm_span_HL_4_para = normalize_matrix(span_HL_4_para) span_scores_matrix = add_HLs_2_tensor3(span_input4score, 
norm_span_HL_1_para, norm_span_HL_2_para, norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a, batch_size, gram_size) span_scores = T.nnet.softmax(span_scores_matrix) #(batch, 7*para_len-21) loss_neg_likelihood = -T.mean( T.log(span_scores[T.arange(batch_size), span_indices])) #ranking loss tanh_span_scores_matrix = span_scores #T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, gram_size), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor( index_matrix[T.arange(batch_size), span_indices], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0 - new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega)) span_loss = loss_neg_likelihood + loss_rank # test_span_scores_matrix=T.dot(test_span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((true_batch_size, gram_size)) #(batch, 13*para_len-78) test_span_scores_matrix = add_HLs_2_tensor3( test_span_input4score, norm_span_HL_1_para, norm_span_HL_2_para, norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a, true_batch_size, gram_size) #word HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) start_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size) norm_start_U_a = normalize_matrix(start_U_a) norm_HL_1_para = normalize_matrix(HL_1_para) norm_HL_2_para = normalize_matrix(HL_2_para) norm_HL_3_para = normalize_matrix(HL_3_para) norm_HL_4_para = normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para( rng, hidden_size, 
overall_word_hidden_size + distance_emb) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para( rng, 1, hidden_size + overall_word_hidden_size + distance_emb) end_norm_U_a = normalize_matrix(end_U_a) end_norm_HL_1_para = normalize_matrix(end_HL_1_para) end_norm_HL_2_para = normalize_matrix(end_HL_2_para) end_norm_HL_3_para = normalize_matrix(end_HL_3_para) end_norm_HL_4_para = normalize_matrix(end_HL_4_para) start_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_start_U_a, batch_size, true_p_len) start_scores = T.nnet.softmax(start_scores_matrix) #(batch, para_len) ''' forward start info to end prediction ''' distance_matrix = word_indices[:, 0].dimshuffle( 0, 'x') - T.arange(true_p_len).dimshuffle('x', 0) #(batch, p_len) distance_trunc_matrix = T.maximum( -distance, T.minimum(distance, distance_matrix)) + distance #(batch, p_len) zero_distance_matrix = T.zeros( (true_batch_size * true_p_len, 2 * distance + 1)) filled_distance_matrix = T.set_subtensor( zero_distance_matrix[T.arange(true_batch_size * true_p_len), distance_trunc_matrix.flatten()], 1.0) filled_distance_tensor3 = filled_distance_matrix.reshape( (true_batch_size, true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle( 0, 2, 1) #(batch_size, distance_emb, p_len) end_word_input4score = T.concatenate( [word_input4score, filled_distance_tensor3], axis=1) #(batch, +distance_emb, p_len) end_scores_matrix = add_HLs_2_tensor3(end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, batch_size, true_p_len) end_scores = T.nnet.softmax(end_scores_matrix) #(batch, para_len) start_loss_neg_likelihood = -T.mean( T.log(start_scores[T.arange(batch_size), word_indices[:, 0]])) end_loss_neg_likelihood = 
-T.mean( T.log(end_scores[T.arange(batch_size), word_indices[:, 1]])) #ranking loss start tanh_start_scores_matrix = start_scores #T.tanh(span_scores_matrix) #(batch, gram_size) start_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) start_new_index_matrix = T.set_subtensor( start_index_matrix[T.arange(batch_size), word_indices[:, 0]], 1.0) start_prob_batch_posi = tanh_start_scores_matrix[ start_new_index_matrix.nonzero()] start_prob_batch_nega = tanh_start_scores_matrix[( 1.0 - start_new_index_matrix).nonzero()] start_repeat_posi = T.extra_ops.repeat(start_prob_batch_posi, start_prob_batch_nega.shape[0], axis=0) start_repeat_nega = T.extra_ops.repeat(start_prob_batch_nega.dimshuffle( 'x', 0), start_prob_batch_posi.shape[0], axis=0).flatten() start_loss_rank = T.mean( T.maximum(0.0, margin - start_repeat_posi + start_repeat_nega)) #ranking loss END end_tanh_scores_matrix = end_scores #T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor( end_index_matrix[T.arange(batch_size), word_indices[:, 1]], 1.0) end_prob_batch_posi = end_tanh_scores_matrix[ end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_scores_matrix[( 1.0 - end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle( 'x', 0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean( T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega)) word_loss = start_loss_neg_likelihood + end_loss_neg_likelihood + start_loss_rank + end_loss_rank #test test_start_scores_matrix = add_HLs_2_tensor3( test_word_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_start_U_a, true_batch_size, true_p_len) #(batch, test_p_len) mask_test_start_return = test_start_scores_matrix * para_mask 
#(batch, p_len) ''' forward start info to end prediction in testing ''' test_distance_matrix = T.argmax(mask_test_start_return, axis=1).dimshuffle( 0, 'x') - T.arange(true_p_len).dimshuffle('x', 0) #(batch, p_len) test_distance_trunc_matrix = T.maximum( -distance, T.minimum(distance, test_distance_matrix)) + distance #(batch, p_len) test_zero_distance_matrix = T.zeros( (true_batch_size * true_p_len, 2 * distance + 1)) test_filled_distance_matrix = T.set_subtensor( test_zero_distance_matrix[T.arange(true_batch_size * true_p_len), test_distance_trunc_matrix.flatten()], 1.0) test_filled_distance_tensor3 = test_filled_distance_matrix.reshape( (true_batch_size, true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle( 0, 2, 1) #(batch_size, distance_emb, p_len) test_end_word_input4score = T.concatenate( [test_word_input4score, test_filled_distance_tensor3], axis=1) #(batch, +distance-emb, p_len) end_test_scores_matrix = add_HLs_2_tensor3( test_end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size, true_p_len) #(batch, test_p_len) end_mask_test_return = end_test_scores_matrix * para_mask #(batch, p_len) word_gram_1 = mask_test_start_return + end_mask_test_return word_gram_2 = mask_test_start_return[:, : -1] + end_mask_test_return[:, 1:] #(batch* hidden_size, maxsenlen-1) word_gram_3 = mask_test_start_return[:, : -2] + end_mask_test_return[:, 2:] #(batch* hidden_size, maxsenlen-2) word_gram_4 = mask_test_start_return[:, : -3] + end_mask_test_return[:, 3:] #(batch* hidden_size, maxsenlen-3) word_gram_5 = mask_test_start_return[:, : -4] + end_mask_test_return[:, 4:] #(batch* hidden_size, maxsenlen-4) word_pair_scores = T.concatenate( [word_gram_1, word_gram_2, word_gram_3, word_gram_4, word_gram_5], axis=1) #(batch_size, gram_size) #ans words train ans_HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size) ans_HL_2_para = create_ensemble_para(rng, hidden_size, 
hidden_size) ans_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) ans_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) ans_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size) norm_ans_U_a = normalize_matrix(ans_U_a) norm_ans_HL_1_para = normalize_matrix(ans_HL_1_para) norm_ans_HL_2_para = normalize_matrix(ans_HL_2_para) norm_ans_HL_3_para = normalize_matrix(ans_HL_3_para) norm_ans_HL_4_para = normalize_matrix(ans_HL_4_para) ans_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para, norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a, batch_size, true_p_len) ans_scores_vec = T.nnet.softmax( ans_scores_matrix).flatten() #(batch, para_len) ans_loss_neg_likelihood = -T.mean(T.log(ans_scores_vec[ans_indices])) ans_index_vec = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX).flatten() ans_new_index = T.set_subtensor(ans_index_vec[ans_indices], 1.0) ans_prob_batch_posi = ans_scores_vec[ans_new_index.nonzero()] ans_prob_batch_nega = ans_scores_vec[(1.0 - ans_new_index).nonzero()] ans_repeat_posi = T.extra_ops.repeat(ans_prob_batch_posi, ans_prob_batch_nega.shape[0], axis=0) ans_repeat_nega = T.extra_ops.repeat(ans_prob_batch_nega.dimshuffle( 'x', 0), ans_prob_batch_posi.shape[0], axis=0).flatten() ans_loss_rank = T.mean( T.maximum(0.0, margin - ans_repeat_posi + ans_repeat_nega)) ans_loss = ans_loss_neg_likelihood + ans_loss_rank #ans words test test_ans_scores_matrix = add_HLs_2_tensor3( test_word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para, norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a, true_batch_size, true_p_len) test_ans_scores_matrix = test_ans_scores_matrix * para_mask #T.nnet.softmax(test_ans_scores_matrix) #(batch, para_len) ans_gram_1 = test_ans_scores_matrix ans_gram_2 = (test_ans_scores_matrix[:, :-1] + test_ans_scores_matrix[:, 1:] ) / 2.0 #(batch* hidden_size, maxsenlen-1) ans_gram_3 = (test_ans_scores_matrix[:, :-2] + 
test_ans_scores_matrix[:, 1:-1] + test_ans_scores_matrix[:, 2:] ) / 3.0 #(batch* hidden_size, maxsenlen-2) ans_gram_4 = ( test_ans_scores_matrix[:, :-3] + test_ans_scores_matrix[:, 1:-2] + test_ans_scores_matrix[:, 2:-1] + test_ans_scores_matrix[:, 3:] ) / 4.0 #(batch* hidden_size, maxsenlen-3) ans_gram_5 = ( test_ans_scores_matrix[:, :-4] + test_ans_scores_matrix[:, 1:-3] + test_ans_scores_matrix[:, 2:-2] + test_ans_scores_matrix[:, 3:-1] + test_ans_scores_matrix[:, 4:]) / 5.0 #(batch* hidden_size, maxsenlen-4) ans_word_scores = T.concatenate( [ans_gram_1, ans_gram_2, ans_gram_3, ans_gram_4, ans_gram_5], axis=1) #(batch, hidden_size, maxsenlen-(0+1+2+3+4)) ''' form test spans and masks ''' test_span_word_scores_matrix = word_pair_scores + ans_word_scores #test_span_scores_matrix+ test_spans_mask_1 = para_mask test_spans_mask_2 = para_mask[:, : -1] * para_mask[:, 1:] #(batch* hidden_size, maxsenlen-1) test_spans_mask_3 = para_mask[:, : -2] * para_mask[:, 1: -1] * para_mask[:, 2:] #(batch* hidden_size, maxsenlen-2) test_spans_mask_4 = para_mask[:, : -3] * para_mask[:, 1: -2] * para_mask[:, 2: -1] * para_mask[:, 3:] #(batch* hidden_size, maxsenlen-3) test_spans_mask_5 = para_mask[:, : -4] * para_mask[:, 1: -3] * para_mask[:, 2: -2] * para_mask[:, 3: -1] * para_mask[:, 4:] test_spans_mask = T.concatenate([ test_spans_mask_1, test_spans_mask_2, test_spans_mask_3, test_spans_mask_4, test_spans_mask_5 ], axis=1) #(batch, 5*p_len -) # test_return=T.argmax(test_span_word_scores_matrix, axis=1) #batch T.argmax(test_span_word_scores_matrix*test_spans_mask, axis=1) #batch test_return = T.argmax(test_span_word_scores_matrix * test_spans_mask, axis=1) #batch # params = [embeddings,char_embeddings]+NN_para+[U_a] params = ( [embeddings, char_embeddings, extra_embeddings, distance_embeddings] + CNN_para # +[span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para] + [start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] + [end_U_a, end_HL_1_para, end_HL_2_para, 
end_HL_3_para, end_HL_4_para] + [ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para]) L2_reg = L2norm_paraList([ embeddings, char_embeddings, extra_embeddings, distance_embeddings, conv_W_1, conv_W_2, conv_W_1_q, conv_W_2_q, conv_W_char, conv_W_3, conv_W_3_q, # conv_W_4, conv_W_5,conv_W_4_q, conv_W_5_q, # span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para, start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para, end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para, ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para ]) #L2_reg = L2norm_paraList(params) cost = word_loss + ans_loss + L2_weight * L2_reg #span_loss+ accumulator = [] for para_i in params: eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([ paragraph, questions, span_indices, word_indices, ans_indices, para_mask, q_mask, extra, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len ], cost, updates=updates, on_unused_input='ignore') test_model = theano.function([ paragraph, questions, para_mask, q_mask, extra, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len ], test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches = train_size / batch_size # remain_train=train_size%batch_size train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_test_batches = test_size / test_batch_size # remain_test=test_size%batch_size test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size] max_F1_acc = 0.0 max_exact_acc = 0.0 cost_i = 0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.Random(200).shuffle(train_ids) iter_accu = 0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_ids[para_id:para_id + batch_size] boundary_labels_batch = train_word_label_list[train_id_batch] ans_label_list = [] for i in range(batch_size): start = boundary_labels_batch[i][0] + i * p_len_limit end = boundary_labels_batch[i][1] + i * p_len_limit ans_label_list += range(start, end + 1) ans_label_list = numpy.asarray(ans_label_list, dtype='int32') cost_i += train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_span_label_list[train_id_batch], boundary_labels_batch, ans_label_list, train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_extras[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict = {} q_amount = 0 for test_para_id in test_batch_start: batch_predict_ids = test_model( test_para_list[test_para_id:test_para_id + test_batch_size], test_Q_list[test_para_id:test_para_id + test_batch_size], test_para_mask[test_para_id:test_para_id + test_batch_size], test_Q_mask[test_para_id:test_para_id + test_batch_size], test_para_extras[test_para_id:test_para_id + test_batch_size], test_para_char_list[test_para_id:test_para_id + test_batch_size], test_Q_char_list[test_para_id:test_para_id + test_batch_size], test_para_char_mask[test_para_id:test_para_id + test_batch_size], test_Q_char_mask[test_para_id:test_para_id + test_batch_size], test_p_len_limit) test_para_wordlist_batch = test_para_wordlist_list[ test_para_id:test_para_id + test_batch_size] q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size] q_amount += test_batch_size for q in range(test_batch_size): #for each question pred_ans = decode_predict_id( batch_predict_ids[q], test_para_wordlist_batch[q]) q_id = q_ids_batch[q] pred_dict[q_id] = pred_ans # print q_id, test_para_wordlist_batch[q],'\t',pred_ans with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json', rootPath + 'predictions.txt') if F1_acc > max_F1_acc: max_F1_acc = F1_acc if exact_acc > max_exact_acc: max_exact_acc = exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained 
at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_exact_acc
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=70, filter_size=[3, 1], maxSentLen=70, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered" "words keep random initialization" rand_values = rng.normal( 0.0, 
0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' 'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in' 'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) ' embed_input_l = init_embeddings[sents_ids_l.flatten( )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) '''create parameters for attentive convolution function ''' gate_filter_shape = (emb_size, 1, emb_size, 1) conv_W_pre, conv_b_pre = create_conv_para(rng, filter_shape=gate_filter_shape) conv_W_gate, conv_b_gate = create_conv_para(rng, filter_shape=gate_filter_shape) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W2_context, conv_b2_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) 
NN_para = [ conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate, conv_b_gate, conv_W2, conv_b2, conv_W2_context ] "A gated convolution layer to form more expressive word representations in each sentence" "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)" conv_layer_gate_l = Conv_with_Mask_with_Gate( rng, input_tensor3=embed_input_l, mask_matrix=sents_mask_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate) conv_layer_gate_r = Conv_with_Mask_with_Gate( rng, input_tensor3=embed_input_r, mask_matrix=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate) ''' attentive convolution function, two sizes of filter_width 3&1 are used. Multi-channel ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=conv_layer_gate_l.output_tensor3, input_tensor3_r=conv_layer_gate_r.output_tensor3, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=conv_layer_gate_l.output_tensor3, input_tensor3_r=conv_layer_gate_r.output_tensor3, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), 
image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_W2, b=conv_b2, W_context=conv_W2_context, b_context=conv_b2_context) attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r "Batch normalization for the four output sentence representation vectors" gamma = theano.shared(np.asarray(rng.uniform( low=-1.0 / math.sqrt(hidden_size[0]), high=1.0 / math.sqrt(hidden_size[0]), size=(hidden_size[0])), dtype=theano.config.floatX), borrow=True) beta = theano.shared(np.zeros((hidden_size[0]), dtype=theano.config.floatX), borrow=True) bn_params = [gamma, beta] bn_attentive_sent_embeddings_l = batch_normalization( inputs=attentive_sent_embeddings_l, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True), std=attentive_sent_embeddings_l.std((0, ), keepdims=True), mode='low_mem') bn_attentive_sent_embeddings_r = batch_normalization( inputs=attentive_sent_embeddings_r, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True), std=attentive_sent_embeddings_r.std((0, ), keepdims=True), mode='low_mem') bn_attentive_sent_embeddings_l2 = batch_normalization( inputs=attentive_sent_embeddings_l2, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True), std=attentive_sent_embeddings_l2.std((0, ), keepdims=True), mode='low_mem') bn_attentive_sent_embeddings_r2 = batch_normalization( inputs=attentive_sent_embeddings_r2, gamma=gamma, beta=beta, mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True), std=attentive_sent_embeddings_r2.std((0, ), keepdims=True), mode='low_mem') "Before logistic regression layer, we insert a hidden layer. 
Now form input to HL classifier" HL_layer_1_input = T.concatenate([ bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r, bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r, bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r, bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2, bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2, bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2 ], axis=1) HL_layer_1_input_size = 8 * hidden_size[0] "Create hidden layer parameters" HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, HL_layer_1_input_size, hidden_size[1]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] "Hidden Layer and batch norm to its output again" HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[1], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) gamma_HL = theano.shared(np.asarray(rng.uniform( low=-1.0 / math.sqrt(hidden_size[1]), high=1.0 / math.sqrt(hidden_size[1]), size=(hidden_size[1])), dtype=theano.config.floatX), borrow=True) beta_HL = theano.shared(np.zeros((hidden_size[1]), dtype=theano.config.floatX), borrow=True) bn_params_HL = [gamma_HL, beta_HL] bn_HL_output = batch_normalization(inputs=HL_layer_1.output, gamma=gamma_HL, beta=beta_HL, mean=HL_layer_1.output.mean( (0, ), keepdims=True), std=HL_layer_1.output.std( (0, ), keepdims=True), mode='low_mem') "Form input to LR classifier" LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1) LR_input_size = HL_layer_1_input_size + hidden_size[1] U_a = create_ensemble_para(rng, 3, LR_input_size) # (input_size, 3) LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] "Logistic Regression layer" layer_LR = LogisticRegression( rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b ) #basically it is a multiplication between weight 
matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [ init_embeddings ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL cost = loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 
iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2 and iter % 5 == 0): print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() dev_error_sum = 0.0 for dev_batch_id in dev_batch_start: # for each test batch dev_error_i = dev_model( dev_sents_l[dev_batch_id:dev_batch_id + batch_size], dev_masks_l[dev_batch_id:dev_batch_id + batch_size], dev_sents_r[dev_batch_id:dev_batch_id + batch_size], dev_masks_r[dev_batch_id:dev_batch_id + batch_size], dev_labels_store[dev_batch_id:dev_batch_id + batch_size]) dev_error_sum += dev_error_i dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start)) if dev_acc > max_acc_dev: max_acc_dev = dev_acc print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev ''' best dev model, test ''' error_sum = 0.0 for test_batch_id in test_batch_start: # for each test batch error_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i test_acc = 1.0 - error_sum / (len(test_batch_start)) if test_acc > max_acc_test: max_acc_test = test_acc print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test else: print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + 
os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5], char_filter_size=3, margin=2.0, max_EM=50.302743615): test_batch_size=batch_size*10 model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) word2id={} char2id={} #questions,paragraphs,q_masks,p_masks,labels, word2id train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len) train_size=len(train_para_list) test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len) test_size=len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_label_list = numpy.asarray(train_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) 
test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng) rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_glove() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng) char_embeddings=theano.shared(value=char_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') gold_indices= T.imatrix() #batch, (start, end) for each sample para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask=T.fmatrix() char_q_mask=T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size)) char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len)) char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len)) conv_W_char, conv_b_char=create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0])) conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_1_q, conv_b_1_q=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) NN_para=[conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char] input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,batch_size, p_len_limit,q_len_limit, emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size, conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char, para_mask, q_mask, char_p_masks,char_q_masks) test_input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,test_batch_size, test_p_len_limit,q_len_limit, emb_size, 
char_emb_size,char_len,filter_size,char_filter_size,hidden_size, conv_W_1, conv_b_1,conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char, para_mask, q_mask, char_p_masks,char_q_masks) #(batch, hidden, #(batch, 2*hidden, p_len_limit)) # gram_size = 5*true_p_len-(0+1+2+3+4) HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size) HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, hidden_size) norm_U_a=normalize_matrix(U_a) norm_HL_1_para=normalize_matrix(HL_1_para) norm_HL_2_para=normalize_matrix(HL_2_para) norm_HL_3_para=normalize_matrix(HL_3_para) norm_HL_4_para=normalize_matrix(HL_4_para) end_HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size) end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size) end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size) end_U_a = create_ensemble_para(rng, 1, hidden_size) end_norm_U_a=normalize_matrix(end_U_a) end_norm_HL_1_para=normalize_matrix(end_HL_1_para) end_norm_HL_2_para=normalize_matrix(end_HL_2_para) end_norm_HL_3_para=normalize_matrix(end_HL_3_para) end_norm_HL_4_para=normalize_matrix(end_HL_4_para) span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, batch_size,true_p_len) span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len) end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para, end_norm_U_a, batch_size,true_p_len) end_span_scores=T.nnet.softmax(end_span_scores_matrix) #(batch, para_len) loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,0]])) 
end_loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,1]])) #ranking loss start tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), gold_indices[:,0]], 1.0) prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()] prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()] repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0) repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten() loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega)) #ranking loss END end_tanh_span_scores_matrix = end_span_scores#T.tanh(span_scores_matrix) #(batch, gram_size) end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX) end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), gold_indices[:,1]], 1.0) end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()] end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0-end_new_index_matrix).nonzero()] end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0) end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x',0), end_prob_batch_posi.shape[0], axis=0).flatten() end_loss_rank = T.mean(T.maximum(0.0, margin-end_repeat_posi+end_repeat_nega)) loss = loss_neg_likelihood +end_loss_neg_likelihood+loss_rank+end_loss_rank #test test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para,norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len) mask_test_return=T.argmax(test_span_scores_matrix*para_mask, axis=1) #batch end_test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, 
end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para,end_norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len) end_mask_test_return=T.argmax(end_test_span_scores_matrix*para_mask, axis=1) #batch params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]+[end_U_a,end_HL_1_para,end_HL_2_para,end_HL_3_para,end_HL_4_para] L2_reg =L2norm_paraList([embeddings,char_embeddings,conv_W_1,conv_W_2,conv_W_1_q, conv_W_2_q, conv_W_char,U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]) #L2_reg = L2norm_paraList(params) cost=loss#+L2_weight*L2_reg accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, char_paragraph, #(batch, char_len*p_len) char_questions, char_para_mask, char_q_mask, true_p_len], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len], [mask_test_return,end_mask_test_return], on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 train_id_batch = train_ids[para_id:para_id+batch_size] cost_i+= train_model( train_para_list[train_id_batch], train_Q_list[train_id_batch], train_label_list[train_id_batch], train_para_mask[train_id_batch], train_Q_mask[train_id_batch], train_para_char_list[train_id_batch], train_Q_char_list[train_id_batch], train_para_char_mask[train_id_batch], train_Q_char_mask[train_id_batch], p_len_limit) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() pred_dict={} q_amount=0 p1=0 for test_para_id in test_batch_start: batch_predict_ids, batch_predict_end_ids=test_model( test_para_list[test_para_id:test_para_id+test_batch_size], test_Q_list[test_para_id:test_para_id+test_batch_size], test_para_mask[test_para_id:test_para_id+test_batch_size], test_Q_mask[test_para_id:test_para_id+test_batch_size], test_para_char_list[test_para_id:test_para_id+test_batch_size], test_Q_char_list[test_para_id:test_para_id+test_batch_size], test_para_char_mask[test_para_id:test_para_id+test_batch_size], test_Q_char_mask[test_para_id:test_para_id+test_batch_size], test_p_len_limit) test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size] # test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size] # q_amount+=test_batch_size q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] q_amount+=test_batch_size for q in range(test_batch_size): #for each question # pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q]) start = batch_predict_ids[q] end = batch_predict_end_ids[q] if end < start: start, end = end, start pred_ans = ' '.join(test_para_wordlist_batch[q][start:end+1]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc # if max_exact_acc > max_EM: # store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params) # print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 
'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=40, cand_size=10, hidden_size=[300, 300], max_pred_pick=5): model_options = locals().copy() print "model options", model_options pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'} root = '/save/wenpeng/datasets/FEVER/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" vocabfile = codecs.open(root + 'word2id.txt', 'r', 'utf-8') word2id = json.loads(vocabfile.read()) # co=0 # for line in vocabfile: # word2id = json.loads(line) # co+=1 # print 'co: ', co # word2id = json.load(open(root+'word2id.json')) #json.loads(vocabfile) vocabfile.close() print 'load word2id over' # train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(sent_len, claim_len, cand_size) # train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(sent_len, claim_len, cand_size, word2id) all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, _ = load_SciTailV1_dataset( sent_len, word2id) # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, _ = load_RTE_dataset_as_test(sent_len, word2id) # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) # 
dev_labels_store=np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') # dev_size=len(dev_labels_store) test_size = len(test_labels_store) vocab_size = len(word2id) + 1 print 'vocab size: ', vocab_size rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids = T.imatrix() #(batch, cand_size, sent_len) sents_mask = T.fmatrix() # sents_labels=T.imatrix() #(batch, cand_size) # claim_ids = T.imatrix() #(batch, claim_len) # claim_mask = T.fmatrix() # joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len) # joint_sents_mask=T.ftensor3() # joint_sents_labels=T.imatrix() #(batch, cand_size) claim_ids = T.imatrix() #(batch, claim_len) claim_mask = T.fmatrix() labels = T.ivector() # test_premise_ids = T.imatrix() # test_premise_matrix = T.fmatrix() # test_hypo_ids = T.imatrix() # test_hypo_matrix = T.fmatrix() # test_scitail_minibatch_labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, sent_len, emb_size)).dimshuffle(0, 2, 1) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) # task1_att_conv_W, task1_att_conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) # task1_conv_W_context, task1_conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context, conv_b_context ] ''' training task2, predict 3 labels ''' joint_embed_input_sents = init_embeddings[sents_ids.flatten( )].reshape((batch_size, sent_len, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM joint_embed_input_claim = init_embeddings[claim_ids.flatten()].reshape( (batch_size, sent_len, emb_size)).dimshuffle(0, 2, 1) joint_conv_model_sents = Conv_with_Mask( rng, input_tensor3=joint_embed_input_sents, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_premise_emb = joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have 
an embedding of length hidden_size # joint_batch_sent_emb = joint_sent_embeddings.reshape((batch_size, cand_size, hidden_size[0])) # joint_premise_emb = T.sum(joint_batch_sent_emb*joint_sents_labels.dimshuffle(0,1,'x'), axis=1) #(batch, hidden_size) joint_conv_model_claims = Conv_with_Mask( rng, input_tensor3=joint_embed_input_claim, mask_matrix=claim_mask, image_shape=(batch_size, 1, emb_size, claim_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero joint_claim_embeddings = joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size joint_premise_hypo_emb = T.concatenate( [joint_premise_emb, joint_claim_embeddings], axis=1) #(batch, 2*hidden_size) ''' attentive conv in task2 ''' # joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0,2,1).reshape((batch_size, cand_size*sent_len, emb_size)) # joint_sents_dot = T.batched_dot(joint_sents_tensor3, joint_sents_tensor3.dimshuffle(0,2,1)) #(batch_size, cand_size*sent_len, cand_size*sent_len) # joint_sents_dot_2_matrix = T.nnet.softmax(joint_sents_dot.reshape((batch_size*cand_size*sent_len, cand_size*sent_len))) # joint_sents_context = T.batched_dot(joint_sents_dot_2_matrix.reshape((batch_size, cand_size*sent_len, cand_size*sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size) # joint_add_sents_context = joint_embed_input_sents+joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len) attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3= joint_embed_input_sents, #batch_size*cand_size, 2*emb_size, sent_len input_tensor3_r=joint_embed_input_claim, mask_matrix=sents_mask, 
mask_matrix_r=claim_mask, image_shape=(batch_size, 1, emb_size, sent_len), image_shape_r=(batch_size, 1, emb_size, sent_len), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=att_conv_W, b=att_conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l #(batch_size*cand_size, hidden_size) attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r "Logistic Regression layer" joint_LR_input = T.concatenate([ joint_premise_hypo_emb, attentive_sent_embeddings_l, attentive_sent_embeddings_r ], axis=1) joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0] joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3) joint_LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class joint_LR_para = [joint_U_a, joint_LR_b] joint_layer_LR = LogisticRegression( rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b ) #basically it is a multiplication between weight matrix and input feature vector # joint_loss=joint_layer_LR.negative_log_likelihood(joint_labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
''' testing joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len) joint_sents_mask=T.ftensor3() joint_sents_labels=T.imatrix() #(batch, cand_size) joint_claim_ids = T.imatrix() #(batch, claim_len) joint_claim_mask = T.fmatrix() joint_labels=T.ivector() ''' pred_minibatch_labels = joint_layer_LR.y_pred pred_minibatch_labels_2_2classes = T.where(pred_minibatch_labels > 1, 0, pred_minibatch_labels) pred_minibatch_error = T.mean( T.neq(pred_minibatch_labels_2_2classes, labels)) params = [init_embeddings] + NN_para + joint_LR_para load_model_from_file(root + 'para_for_test_scitail', params) # train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask,joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids, sents_mask, claim_ids, claim_mask, labels], pred_minibatch_error, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], pred_minibatch_error, allow_input_downcast=True, on_unused_input='ignore') # test_model = theano.function([sents_ids,sents_mask,sents_labels, claim_ids,claim_mask, joint_labels], [inter_matrix,test_layer_LR.errors(joint_labels), test_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_ids,sents_mask,sents_labels, claim_ids,claim_mask, joint_labels], [inter_matrix,test_layer_LR.errors(joint_labels), test_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
testing' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False # joint_n_train_batches=joint_train_size/batch_size # joint_train_batch_start=list(np.arange(joint_n_train_batches)*batch_size)+[joint_train_size-batch_size] # n_train_batches=train_size/batch_size # train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size remain_test_batches = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 cost_i = 0.0 error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i test_acc = 1.0 - error_sum / (len(test_batch_start)) print '\tcurrent test_acc:', test_acc
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3, 3], maxSentLen=40, hidden_size=[300, 300], max_term_len=4, p_mode='conc'): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words( maxlen=maxSentLen, wordlen=max_term_len ) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id = load_task_hyper_vs_all_with_words( LenciBenotto_file, maxSentLen, word2id, wordlen=max_term_len) store_word2id(word2id, root_dic + 'LenciBenotto_word2id.pkl') # exit(0) total_size = len(all_sentences_l) hold_test_size = 10000 train_size = total_size - hold_test_size train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32') test_sents_l = np.asarray(test_sents_l, dtype='int32') train_masks_l = np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX) test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32') test_sents_r = 
np.asarray(test_sents_r, dtype='int32') train_masks_r = np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX) test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX) train_word1 = np.asarray(all_word1[:train_size], dtype='int32') train_word2 = np.asarray(all_word2[:train_size], dtype='int32') test_word1 = np.asarray(test_word1, dtype='int32') test_word2 = np.asarray(test_word2, dtype='int32') train_word1_mask = np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX) train_word2_mask = np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX) test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[:train_size], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32') test_labels_store = np.asarray(test_labels, dtype='int32') train_extra = np.asarray(all_extra[:train_size], dtype=theano.config.floatX) test_extra = np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python 
variable "rand_values" into theano variable store_model_to_file(root_dic + 'LenciBenotto_best_para_init_embeddings', [init_embeddings]) #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape( (batch_size, word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape( (batch_size, word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'), axis=1) word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para = [conv_W, conv_b, conv_W_context] ''' attentive convolution function ''' term_vs_term_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1), origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), input_tensor3=embed_word1.dimshuffle(0, 2, 1), input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), mask_matrix=word1_mask, 
mask_matrix_r=word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r=(batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([ tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r, tt_embeddings_l - tt_embeddings_r ], axis=1) term_vs_def_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1), origin_input_tensor3_r=embed_input_r, input_tensor3=embed_word1.dimshuffle(0, 2, 1), input_tensor3_r=embed_input_r, mask_matrix=word1_mask, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([ td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r, td_embeddings_l - td_embeddings_r ], axis=1) def_vs_term_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), input_tensor3=embed_input_l, input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), mask_matrix=sents_mask_l, mask_matrix_r=word2_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = 
def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = T.concatenate([ dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r, dt_embeddings_l - dt_embeddings_r ], axis=1) def_vs_def_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([ dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r, dd_embeddings_l - dd_embeddings_r ], axis=1) if p_mode == 'conc': p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4 * 4 * hidden_size[1] else: p = T.max(T.concatenate([ p_ww.dimshuffle('x', 0, 1), p_wd.dimshuffle('x', 0, 1), p_dw.dimshuffle('x', 0, 1), p_dd.dimshuffle('x', 0, 1) ], axis=0), axis=0) p_len = 4 * hidden_size[1] # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) # HL_input_size=p_len+1+1 # # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh) "form input to LR classifier" LR_input = T.concatenate([ p, cosine_matrix1_matrix2_rowwise(word1_embedding, word2_embedding).dimshuffle(0, 'x'), extra.dimshuffle(0, 'x') ], axis=1) LR_input_size = p_len + 1 + 1 # LR_input = HL_layer_1.output # LR_input_size = hidden_size[1] U_a = create_ensemble_para( rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2, ), 
dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25 ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() params = NN_para + LR_para #[init_embeddings] cost = loss #+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids, word2_ids, word1_mask, word2_mask, extra, labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids, word2_ids, word1_mask, word2_mask, extra, labels ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size if n_test_remain != 0: test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] else: test_batch_start = list(np.arange(n_test_batches) * batch_size) # max_acc_dev=0.0 max_ap_test = 0.0 max_ap_topk_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_word1[train_id_batch], train_word2[train_id_batch], train_word1_mask[train_id_batch], train_word2_mask[train_id_batch], train_extra[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_labels = [] probs = [] gold_labels = [] error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for 
each test batch error_i, pred_i, prob_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_word1[test_batch_id:test_batch_id + batch_size], test_word2[test_batch_id:test_batch_id + batch_size], test_word1_mask[test_batch_id:test_batch_id + batch_size], test_word2_mask[test_batch_id:test_batch_id + batch_size], test_extra[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i pred_labels += list(pred_i) probs += list(prob_i) if n_test_remain != 0: probs = probs[:(len(test_batch_start) - 1) * batch_size] + probs[-n_test_remain:] assert len(test_labels) == len(probs) # test_acc=1.0-error_sum/(len(test_batch_start)) test_ap = apk(test_labels, probs, k=len(test_labels)) test_ap_top100 = apk(test_labels, probs, k=100) if test_ap > max_ap_test: max_ap_test = test_ap store_model_to_file( root_dic + 'LenciBenotto_best_para_' + str(max_ap_test), params) if test_ap_top100 > max_ap_topk_test: max_ap_topk_test = test_ap_top100 print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=70, filter_size=[3], maxSentLen=40, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered" "words keep random initialization" rand_values = rng.normal( 0.0, 0.01, 
(vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' 'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in' 'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) ' embed_input_l = init_embeddings[sents_ids_l.flatten( )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) '''create parameters for attentive convolution function ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [conv_W, conv_b, conv_W_context] ''' attentive convolution function ''' attentive_conv_layer = Attentive_Conv_for_Pair_easy_version( rng, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, 
filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r "Logistic Regression layer" LR_input = T.concatenate([ attentive_sent_embeddings_l, attentive_sent_embeddings_r, attentive_sent_embeddings_l + attentive_sent_embeddings_r, attentive_sent_embeddings_l * attentive_sent_embeddings_r ], axis=1) LR_input_size = 4 * hidden_size[0] U_a = create_ensemble_para(rng, 3, LR_input_size) # (input_size, 3) LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [init_embeddings] + NN_para + LR_para cost = loss "Use AdaGrad to update parameters" updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0): if iter % 1000 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() ''' test ''' error_sum = 0.0 for test_batch_id in test_batch_start: # for each test batch error_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], 
test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i test_acc = 1.0 - error_sum / (len(test_batch_start)) if test_acc > max_acc_test: max_acc_test = test_acc print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') labels = T.imatrix('labels') para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' norm_extraF=normalize_matrix(extraF) U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size) Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX), # @UndefinedVariable name='LR_b', borrow=True) attention_paras=[W_a1, W_a2, U_a, LR_b] params = [embeddings]+paragraph_para+Q_para+attention_paras load_model_from_file(rootPath+'Best_Paras_conv_0.217545454545', params) paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1) paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) para_reps=paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) 
questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) # questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size) questions_reps_tensor=questions_model.output_tensor #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor #use CNN for question modeling # Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen) # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5)) # Q_conv_para=[conv_W, conv_b] # conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4, # image_shape=(batch_size, 1, emb_size, q_len_limit), # filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b) # conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1) # gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1)) # masked_conv_output=conv_output*gru_mask # questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size)) # new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0) # ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2) # ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction # padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX) # 
ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) # ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) # ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad) # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix=para_matrix.T interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T) #(len, para_len) batch_q_reps, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) #attention distributions norm_W_a1=normalize_matrix(W_a1) norm_W_a2=normalize_matrix(W_a2) norm_U_a=normalize_matrix(U_a) transformed_para_reps=T.maximum(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2),0.0) #relu transformed_q_reps=T.maximum(T.dot(batch_q_reps.transpose((0, 2,1)), norm_W_a1),0.0) #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1) add_both=transformed_para_reps+transformed_q_reps # U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size) # U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size) # accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] # accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b) # accu_both=accumu_model.output_tensor.transpose((0,2,1)) prior_att=T.concatenate([add_both, norm_extraF], axis=2) #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2) valid_indices=para_mask.flatten().nonzero()[0] layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b) #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices]) error = 
-T.sum(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y]) distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1])) #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1])) # masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask masked_dis=distributions*para_mask ''' strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1) distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions') para_mask=para_mask masked_dis=distributions*para_mask # masked_label=debug_print(labels*para_mask, 'masked_label') # error=((masked_dis-masked_label)**2).mean() label_mask=T.gt(labels,0.0) neg_label_mask=T.lt(labels,0.0) dis_masked=distributions*label_mask remain_dis_masked=distributions*neg_label_mask ans_size=T.sum(label_mask) non_ans_size=T.sum(neg_label_mask) pos_error=T.sum((dis_masked-label_mask)**2)/ans_size neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)* ''' # def AttentionLayer(q_rep, ext_M): # theano_U_a=debug_print(norm_U_a, 'norm_U_a') # prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att') # f __name__ == '__main__': # prior_att=T.concatenate([prior_att, ext_M], axis=1) # # strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1) # return strength.transpose() #(1, #words) # distributions, updates = theano.scan( # AttentionLayer, # sequences=[questions_reps,extraF] ) # distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions') # labels=debug_print(labels, 'labels') # label_mask=T.gt(labels,0.0) # neg_label_mask=T.lt(labels,0.0) # dis_masked=distributions*label_mask # remain_dis_masked=distributions*neg_label_mask # pos_error=((dis_masked-1)**2).mean() # 
neg_error=((remain_dis_masked-(-1))**2).mean() # error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=error#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( np.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), np.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), np.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), np.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), np.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), np.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX)) #print iter if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' 
past_time = time.time() exact_match=0.0 F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: distribution_matrix=test_model( np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'), np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'), np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX), np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX), np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX)) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size] para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size] paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] para_len=len(test_para_wordlist_list[0]) if para_len!=len(distribution_matrix[0]): print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) exit(0) # q_size=len(distribution_matrix) q_amount+=batch_size # print q_size # print test_para_word_list Q_list_inword=test_Q_list_word[test_para_id:test_para_id+batch_size] for q in range(batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) q_gold_ans_set=para_gold_ansset_list[q] # print 
test_para_wordlist_list[q] # print Q_list_inword[q] # print pred_ans.encode('utf8'), q_gold_ans_set if pred_ans in q_gold_ans_set: exact_match+=1 F1=MacroF1(pred_ans, q_gold_ans_set) F1_match+=F1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc=F1_match/q_amount exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) dev_sents = np.asarray(all_sentences[1], dtype='int32') dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX) dev_labels = np.asarray(all_labels[1], dtype='int32') dev_size = len(dev_labels) test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec', emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec' ], 40) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) 
embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x') des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] # NN_para = multiCNN_para+ACNN_para conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, 
emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb], axis=1) LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_a = create_ensemble_para( rng, 12, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 12 prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix) loss = -T.mean(T.log(prob_pos)) ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) LR_att_input = T.concatenate([gru_sent_embeddings, 
bow_emb], axis=1) LR_att_input_size = hidden_size[0] + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_att_a = create_ensemble_para( rng, 12, LR_att_input_size) # the weight matrix hidden_size*2 LR_att_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_att_para = [U_att_a, LR_att_b] layer_att_LR = LogisticRegression( rng, input=LR_att_input, n_in=LR_att_input_size, n_out=12, W=U_att_a, b=LR_att_b ) #basically it is a multiplication between weight matrix and input feature vector att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax) #batch * 12 att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix, att_score_matrix) att_loss = -T.mean(T.log(att_prob_pos)) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) 
sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l acnn_LR_input = T.concatenate( [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1) acnn_LR_input_size = hidden_size[0] * 2 + emb_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a = create_ensemble_para( rng, 12, acnn_LR_input_size) # the weight matrix hidden_size*2 acnn_LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class acnn_LR_para = [acnn_U_a, acnn_LR_b] acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para # put all model parameters together cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' ensemble_NN_scores = T.max(T.concatenate([ att_score_matrix.dimshuffle('x', 0, 1), score_matrix.dimshuffle('x', 0, 1), acnn_score_matrix.dimshuffle('x', 0, 1) ], axis=0), axis=0) ensemble_scores = 0.5 * ensemble_NN_scores + 0.5 * cosine_score_matrix binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function( [sents_id_matrix, sents_mask, labels, 
des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] # max_acc_dev=0.0 max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) cost_i = 0.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for 
each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # print 'pred_labels:', pred_labels # print 'gold_labels;', gold_labels all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col( all_pred_labels, all_gold_labels) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def _batch_starts(total, batch_size):
    """Return start offsets of full-sized batches that together cover ``total`` items.

    Offsets advance by ``batch_size``.  When ``total`` is not a multiple of
    ``batch_size`` one extra offset, ``total - batch_size``, is appended so the
    tail is covered by a batch that is still full-sized (it overlaps the
    previous batch).  No duplicate offset is produced when ``total`` divides
    evenly.

    Raises:
        ValueError: if ``batch_size`` is not positive or there are fewer than
            ``batch_size`` items (a full batch cannot be formed).
    """
    if batch_size <= 0 or total < batch_size:
        raise ValueError('need at least one full batch: total=%d, batch_size=%d'
                         % (total, batch_size))
    starts = list(range(0, total - batch_size + 1, batch_size))
    if total % batch_size:
        starts.append(total - batch_size)
    return starts


def evaluate_lenet5(learning_rate=0.5, n_epochs=2000, batch_size=500, emb_size=300,
                    hidden_size=300, L2_weight=0.0001, para_len_limit=700,
                    q_len_limit=40):
    """Train a paragraph/question attention tagger and periodically report test F1.

    Paragraph and question tokens are embedded and encoded by two bidirectional
    GRUs.  Every paragraph position is combined with the max-pooled question
    representation and three extra features, then classified into two classes
    by a logistic-regression layer.  Parameters are updated with AdaGrad.
    Every 10 updates the whole test set is decoded and the average ``MacroF1``
    is printed.  (The data is presumably SQuAD, judging by the loaders' names
    in the project -- confirm.)

    Args:
        learning_rate: AdaGrad step size.
        n_epochs: maximum number of passes over the training batches.
        batch_size: fixed number of examples per call; the graph reshapes with
            this value, so every batch must be exactly this large.
        emb_size: word-embedding dimensionality.
        hidden_size: GRU hidden size and attention projection size.
        L2_weight: accepted for backward compatibility; the L2 term is
            currently not added to the cost.
        para_len_limit: forwarded to ``load_train`` / ``load_dev_or_test``.
        q_len_limit: forwarded to ``load_train`` / ``load_dev_or_test``.

    Returns:
        None.  Progress and scores are printed.

    Raises:
        SystemExit: (non-zero status) when the loaded lists are inconsistent.
        ValueError: when the train or test set is smaller than ``batch_size``.
    """
    # Must stay the first statement so that only the arguments are captured.
    model_options = locals().copy()
    print('model options %s' % (model_options,))
    rng = numpy.random.RandomState(23455)
    float_x = theano.config.floatX

    # ------------------------------------------------------------------ data
    (train_para_list, train_Q_list, train_label_list, train_para_mask,
     train_mask, word2id, train_feature_matrixlist) = load_train(
         para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if (train_size != len(train_Q_list)
            or train_size != len(train_label_list)
            or train_size != len(train_para_mask)):
        # Inconsistent data is a failure: exit with a non-zero status.
        sys.exit('train_size!=len(Q_list) or train_size!=len(label_list) '
                 'or train_size!=len(para_mask)')

    (test_para_list, test_Q_list, test_para_mask, test_mask,
     overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list,
     test_feature_matrixlist) = load_dev_or_test(
         word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if (test_size != len(test_Q_list)
            or test_size != len(test_mask)
            or test_size != len(test_para_mask)):
        sys.exit('test_size!=len(test_Q_list) or test_size!=len(test_mask) '
                 'or test_size!=len(test_para_mask)')

    # Random embeddings, then overwritten by word2vec vectors where available
    # (as done by load_word2vec_to_init -- helper not visible here).
    id2word = {idx: word for word, idx in overall_word2id.items()}
    word2vec = load_word2vec()
    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      float_x, numpy.random.RandomState(1234))
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # Symbolic inputs.
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # 3 extra features per paragraph position

    # ----------------------------------------------------------------- model
    print('... building the model')
    # (batch, para_len, emb_size) -> (batch, emb_size, para_len)
    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2, 1))
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]
    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,
        U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b)
    # presumably (batch, hidden, para_len) -- TODO confirm against the GRU class
    para_reps = paragraph_model.output_tensor

    # (batch, q_len, emb_size) -> (batch, emb_size, q_len)
    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2, 1))
    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]
    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size,
        U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    # One vector per question, with a singleton axis so it broadcasts over
    # paragraph positions when added below.
    questions_reps = questions_model.output_sent_rep_maxpooling.reshape(
        (batch_size, 1, hidden_size))

    # Attention / classification parameters.
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # +3 extra features
    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)
    LR_b = theano.shared(value=numpy.zeros((2,), dtype=float_x),
                         name='LR_b', borrow=True)
    attention_paras = [W_a1, W_a2, U_a, LR_b]

    transformed_para_reps = T.tanh(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2))
    transformed_q_reps = T.tanh(T.dot(questions_reps, norm_W_a1))
    add_both = 0.5 * (transformed_para_reps + transformed_q_reps)
    prior_att = T.concatenate([add_both, normalize_matrix(extraF)], axis=2)

    # Only positions with a non-zero mask contribute to the loss.
    valid_indices = para_mask.flatten().nonzero()[0]
    layer3 = LogisticRegression(
        rng,
        input=prior_att.reshape(
            (batch_size * prior_att.shape[1], hidden_size + 3)),
        n_in=hidden_size + 3, n_out=2, W=norm_U_a, b=LR_b)
    # Mean negative log-probability of the gold class over valid positions.
    error = -T.mean(T.log(layer3.p_y_given_x)[
        valid_indices, labels.flatten()[valid_indices]])
    # Probability of the last class for every position, masked paddings -> 0.
    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    masked_dis = distributions * para_mask

    params = [embeddings] + paragraph_para + Q_para + attention_paras
    # NOTE: L2 regularisation (L2_weight) is disabled; the cost is the plain
    # classification error, exactly as before.
    cost = error

    # AdaGrad: one squared-gradient accumulator per parameter.
    accumulator = [
        theano.shared(
            numpy.zeros_like(param.get_value(borrow=True), dtype=float_x),
            borrow=True)
        for param in params
    ]
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        error, updates=updates, on_unused_input='ignore')
    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis, on_unused_input='ignore')

    # -------------------------------------------------------------- training
    print('... training')
    patience = 500000000000000  # effectively disables early stopping
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    train_batch_start = _batch_starts(train_size, batch_size)
    test_batch_start = _batch_starts(test_size, batch_size)

    def batch_of(rows, start, dtype):
        # Slice one fixed-size batch and convert it for the Theano function.
        return np.asarray(rows[start:start + batch_size], dtype=dtype)

    max_f1 = 0.0
    cost_i = 0.0
    n_updates = 0  # number of training batches actually run so far
    while epoch < n_epochs and not done_looping:
        epoch += 1
        for para_id in train_batch_start:
            n_updates += 1
            cost_i += train_model(
                batch_of(train_para_list, para_id, 'int32'),
                batch_of(train_Q_list, para_id, 'int32'),
                batch_of(train_label_list, para_id, 'int32'),
                batch_of(train_para_mask, para_id, float_x),
                batch_of(train_mask, para_id, float_x),
                batch_of(train_feature_matrixlist, para_id, float_x))

            if n_updates % 10 == 0:
                print('Epoch  %s iter %s average cost: %s uses  %s min'
                      % (epoch, n_updates, cost_i / n_updates,
                         (time.time() - past_time) / 60.0))
                print('Testing...')
                past_time = time.time()
                f1_sum = 0.0
                q_amount = 0
                scored_upto = 0  # test examples below this index are already scored
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        batch_of(test_para_list, test_para_id, 'int32'),
                        batch_of(test_Q_list, test_para_id, 'int32'),
                        batch_of(test_para_mask, test_para_id, float_x),
                        batch_of(test_mask, test_para_id, float_x),
                        batch_of(test_feature_matrixlist, test_para_id, float_x))
                    batch_end = test_para_id + batch_size
                    test_para_wordlist_list = test_text_list[test_para_id:batch_end]
                    para_gold_ansset_list = q_ansSet_list[test_para_id:batch_end]
                    paralist_extra_features = test_feature_matrixlist[test_para_id:batch_end]
                    sub_para_mask = test_para_mask[test_para_id:batch_end]

                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        sys.exit('para_len!=len(distribution_matrix[0]): %s %s'
                                 % (para_len, len(distribution_matrix[0])))

                    for q in range(batch_size):
                        if test_para_id + q < scored_upto:
                            # The shifted tail batch overlaps the previous one;
                            # do not count those questions twice.
                            continue
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q],
                            distribution_matrix[q],
                            np.asarray(paralist_extra_features[q], dtype=float_x),
                            sub_para_mask[q])
                        f1_sum += MacroF1(pred_ans, para_gold_ansset_list[q])
                        q_amount += 1
                    scored_upto = batch_end

                avg_f1 = f1_sum / q_amount
                if avg_f1 > max_f1:
                    max_f1 = avg_f1
                print('current average F1: %s \t\tmax F1: %s' % (avg_f1, max_f1))

            if patience <= n_updates:
                done_looping = True
                break

        print('Epoch  %s uses  %s min' % (epoch, (time.time() - mid_time) / 60.0))
        mid_time = time.time()

    end_time = time.time()
    print('Optimization complete.')
    print('Best average F1 on the test set: %f' % max_f1)
    sys.stderr.write('The code for file %s ran for %.2fm\n'
                     % (os.path.split(__file__)[1],
                        (end_time - start_time) / 60.))
def evaluate_lenet5(term1_str, term2_str): emb_size=300 filter_size=[3,3] maxSentLen=40 hidden_size=[300,300] max_term_len=4 p_mode = 'conc' batch_size = 1 term1_def, source1 = load_concept_def(term1_str) print '\n',term1_str, ':\t', term1_def,'\t', source1,'\n' term2_def, source2 = load_concept_def(term2_str) print '\n',term2_str, ':\t', term2_def, '\t', source2,'\n' # exit(0) word2id = load_word2id('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_word2id.pkl') seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id = load_EVAlution_hyper_vs_all_with_words(maxSentLen, word2id, wordlen=max_term_len) test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask, test_extra, word2id = parse_individual_termPair(term1_str, term2_str, term1_def, term2_def, maxSentLen, word2id, wordlen=max_term_len) # total_size = len(all_sentences_l) # hold_test_size = 10000 # train_size = total_size - hold_test_size # train_sents_l=np.asarray(all_sentences_l[:train_size], dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32') test_sents_l=np.asarray(test_sents_l, dtype='int32') # train_masks_l=np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) # 
test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX) test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX) # train_sents_r=np.asarray(all_sentences_r[:train_size], dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32') test_sents_r=np.asarray(test_sents_r, dtype='int32') # train_masks_r=np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX) test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX) # train_word1=np.asarray(all_word1[:train_size], dtype='int32') # train_word2=np.asarray(all_word2[:train_size], dtype='int32') test_word1=np.asarray(test_word1, dtype='int32') test_word2=np.asarray(test_word2, dtype='int32') # train_word1_mask=np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX) # train_word2_mask=np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX) test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX) # train_labels_store=np.asarray(all_labels[:train_size], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32') # test_labels_store=np.asarray(test_labels, dtype='int32') # train_extra=np.asarray(all_extra[:train_size], dtype=theano.config.floatX) test_extra=np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size=len(test_extra) print ' test size: ', len(test_extra) vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words # 
rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable # store_model_to_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings]) # exit(0) #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() # labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1) word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para=[conv_W, 
conv_b,conv_W_context] ''' attentive convolution function ''' term_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = word1_mask, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1) term_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_input_r, input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_input_r, mask_matrix = word1_mask, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1) def_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_input_l, input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = sents_mask_l, mask_matrix_r = word2_mask, 
image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1) def_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_input_r, input_tensor3=embed_input_l, input_tensor3_r = embed_input_r, mask_matrix = sents_mask_l, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1) if p_mode == 'conc': p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4*4*hidden_size[1] else: p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0) p_len =4*hidden_size[1] "form input to LR classifier" LR_input = T.concatenate([p,extra.dimshuffle(0,'x')],axis=1) LR_input_size=p_len+1 U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, 
n_out=2, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector params = NN_para+LR_para #[init_embeddings] load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings]) load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params) test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... testing' n_test_batches=test_size/batch_size n_test_remain = test_size%batch_size if n_test_remain!=0: test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] else: test_batch_start=list(np.arange(n_test_batches)*batch_size) # max_acc_dev=0.0 # max_ap_test=0.0 # max_ap_topk_test=0.0 # max_f1=0.0 # cost_i=0.0 # train_indices = range(train_size) for idd, test_batch_id in enumerate(test_batch_start): # for each test batch pred_i, prob_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_word1[test_batch_id:test_batch_id+batch_size], test_word2[test_batch_id:test_batch_id+batch_size], test_word1_mask[test_batch_id:test_batch_id+batch_size], test_word2_mask[test_batch_id:test_batch_id+batch_size], test_extra[test_batch_id:test_batch_id+batch_size]) print pred_i, prob_i
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=10, hidden_size=10, L2_weight=0.0001, margin=0.5, train_size=4000000, test_size=1000, max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.0): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = np.random.RandomState(23455) word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len) test_ground_truth,all_candidates_f1,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len) overall_vocab_size=len(word2id) print 'vocab size:', overall_vocab_size rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234)) # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() left=T.imatrix() #(2*batch, len) left_mask=T.fmatrix() #(2*batch, len) span=T.imatrix() #(2*batch, span_len) span_mask=T.fmatrix() #(2*batch, span_len) right=T.imatrix() #(2*batch, len) right_mask=T.fmatrix() #(2*batch, len) q=T.imatrix() #(2*batch, len_q) q_mask=T.fmatrix() #(2*batch, len_q) ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b] U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size) U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size) GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b] W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) attend_para=[W_a1, W_a2] params = [embeddings]+GRU1_para+attend_para+GRU2_para # load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params) left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span) right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context) q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q) left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) left_reps=left_model.output_tensor #(batch, emb, para_len) span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) span_reps=span_model.output_tensor #(batch, emb, para_len) right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) right_reps=right_model.output_tensor #(batch, emb, para_len) q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) q_reps=q_model.output_tensor 
#(batch, emb, para_len) #interaction left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps) span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps) right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps) # q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps) # q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps) # q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps) #combine origin_W=normalize_matrix(W_a1) attend_W=normalize_matrix(W_a2) left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W) span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W) right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W) q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W) left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W) span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W) right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W) q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W) add_left=left_origin_reps+q_attend_left_reps #(2*batch, len ,hidden) add_span=span_origin_reps+q_attend_span_reps add_right=right_origin_reps+q_attend_right_reps add_q_by_left=q_origin_reps+left_attend_q_reps add_q_by_span=q_origin_reps+span_attend_q_reps add_q_by_right=q_origin_reps+right_attend_q_reps #second GRU add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), 
Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim) add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b) add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim) paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden) question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, add_q_by_right_reps], axis=1) #(batch, 3*hidden) simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat) #(2*batch) pos_simi_vec=simi_list[::2] neg_simi_vec=simi_list[1::2] raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec) #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) #L2_reg = L2norm_paraList(params) cost=T.sum(raw_loss)#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) 
# create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size #batch_size means how many pairs remain_train=train_size%batch_size # train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always ou shu if remain_train>0: train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] else: train_batch_start=list(np.arange(n_train_batches)*batch_size) max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_odd_ids = list(np.arange(train_size)*2) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_odd_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 if iter%100==0: print 'iter:', iter iter_accu+=1 train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]] train_id_list=sum(train_id_list,[]) # print train_id_list cost_i+= train_model( 
np.asarray([train_lefts[id] for id in train_id_list], dtype='int32'), np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_spans[id] for id in train_id_list], dtype='int32'), np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_rights[id] for id in train_id_list], dtype='int32'), np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX), np.asarray([train_questions[id] for id in train_id_list], dtype='int32'), np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX)) #print iter if iter%100==0: print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() exact_match=0.0 F1_match=0.0 for test_pair_id in range(test_size): test_example_lefts=test_lefts[test_pair_id] test_example_lefts_mask=test_lefts_mask[test_pair_id] test_example_spans=test_spans[test_pair_id] test_example_spans_mask=test_spans_mask[test_pair_id] test_example_rights=test_rights[test_pair_id] test_example_rights_mask=test_rights_mask[test_pair_id] test_example_questions=test_questions[test_pair_id] test_example_questions_mask=test_questions_mask[test_pair_id] test_example_candidates_f1=all_candidates_f1[test_pair_id] test_example_size=len(test_example_lefts) # print 'test_pair_id, test_example_size:', test_pair_id, test_example_size if test_example_size < test_batch_size: #pad pad_size=test_batch_size-test_example_size test_example_lefts+=test_example_lefts[-1:]*pad_size test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size test_example_spans+=test_example_spans[-1:]*pad_size test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size test_example_rights+=test_example_rights[-1:]*pad_size test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size test_example_questions+=test_example_questions[-1:]*pad_size 
test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size test_example_candidates_f1+=test_example_candidates_f1[-1:]*pad_size test_example_size=test_batch_size n_test_batches=test_example_size/test_batch_size n_test_remain=test_example_size%test_batch_size if n_test_remain > 0: test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size] else: test_batch_start=list(np.arange(n_test_batches)*test_batch_size) all_simi_list=[] all_cand_list=[] for test_para_id in test_batch_start: simi_return_vector=test_model( np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'), np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX)) candidate_f1_list=test_example_candidates_f1[test_para_id:test_para_id+test_batch_size] all_simi_list+=list(simi_return_vector) all_cand_list+=candidate_f1_list top1_f1=all_cand_list[np.argsort(all_simi_list)[-1]] # print top1_cand, test_ground_truth[test_pair_id] if top1_f1 == 1.0: exact_match+=1 # F1=macrof1(top1_cand, test_ground_truth[test_pair_id]) # print '\t\t\t', F1 F1_match+=top1_f1 # match_amount=len(pred_ans_set & q_gold_ans_set) # # print 'q_gold_ans_set:', q_gold_ans_set # # print 'pred_ans_set:', pred_ans_set # if match_amount>0: # exact_match+=match_amount*1.0/len(pred_ans_set) F1_acc=F1_match/test_size 
exact_acc=exact_match/test_size if F1_acc> max_F1_acc: max_F1_acc=F1_acc # store_model_to_file(params, emb_size) if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Para_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) all_sentences, all_masks, all_labels, word2id = load_reliefweb_il5_12_multilabel( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents = np.asarray(all_sentences[0], dtype='int32') train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX) train_labels = np.asarray(all_labels[0], dtype='int32') train_size = len(train_labels) # dev_sents=all_sentences[1] # dev_masks=all_masks[1] # dev_labels=all_labels[1] # dev_size=len(dev_labels) test_sents = np.asarray(all_sentences[2], dtype='int32') test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX) test_labels = np.asarray(all_labels[2], dtype='int32') test_size = len(test_labels) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec', emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec' ], 40) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = 
T.fmatrix('sents_mask') labels = T.imatrix('labels') ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) # conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) # NN_para = [conv_W, conv_b, conv_W2, conv_b2] # conv_model = Conv_with_Mask(rng, input_tensor3=common_input, # mask_matrix = sents_mask, # image_shape=(batch_size, 1, emb_size, maxSentLen), # filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b) #mutiple mask with the conv_out to set the features by UNK to zero # sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size # # conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input, # mask_matrix = sents_mask, # image_shape=(batch_size, 1, emb_size, maxSentLen), # filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2) #mutiple mask with the conv_out to set the features by UNK to zero # sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size # # LR_input = T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1) # LR_input_size = hidden_size[0]*2+emb_size # #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative # U_a = create_ensemble_para(rng, 8, LR_input_size) # the weight matrix hidden_size*2 # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class # LR_para=[U_a, LR_b] # 
layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector # score_matrix = T.nnet.sigmoid(layer_LR.before_softmax) #batch * 8 # sub_labels = labels[:,:8] # prob_pos = T.where( sub_labels < 1, 1.0-score_matrix, score_matrix) # loss = -T.mean(T.log(prob_pos)) ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.maxpooling_vec #gru_layer.output_sent_rep # (batch_size, hidden_size) LR_att_input = gru_sent_embeddings #T.concatenate([gru_sent_embeddings, bow_emb], axis=1) LR_att_input_size = hidden_size[0] #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative U_att_a = create_ensemble_para( rng, 8, LR_att_input_size) # the weight matrix hidden_size*2 LR_att_b = theano.shared(value=np.zeros((8, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_att_para = [U_att_a, LR_att_b] layer_att_LR = LogisticRegression( rng, input=LR_att_input, n_in=LR_att_input_size, n_out=8, W=U_att_a, b=LR_att_b ) #basically it is a multiplication between weight matrix and input feature vector att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax) #batch * 12 sub_labels = labels[:, :8] att_prob_pos = T.where(sub_labels < 1, 1.0 - att_score_matrix, att_score_matrix) att_loss = -T.mean(T.log(att_prob_pos)) params = GRU_NN_para + LR_att_para # put all model parameters together cost = att_loss #+Div_reg*diversify_reg#+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) ''' testing ''' binarize_prob 
= T.where(att_score_matrix > 0.3, 1, 0) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_meanf1_test = 0.0 max_weightf1_test = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) iter_accu = 0 cost_i = 0.0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] # labels_matrix = [] # for lab in train_labels[train_id_batch]: # vec = [0]*8 # vec[lab]=1 # labels_matrix.append(vec) # labels_matrix = np.asarray(labels_matrix, dtype='int32') cost_i += train_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch]) #after each 1000 batches, we test the performance 
of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() # error_sum=0.0 # for test_batch_id in test_batch_start: # for each test batch # error_i, pred_labels=test_model( # test_sents[test_batch_id:test_batch_id+batch_size], # test_masks[test_batch_id:test_batch_id+batch_size]) # pred_labels=list(pred_labels) # error_sum+=error_i # # test_accuracy=1.0-error_sum/(len(test_batch_start)) # if test_accuracy > max_acc_test: # max_acc_test=test_accuracy # print '\t\t\t\t\t\t\t\tcurrent testbacc:', test_accuracy, '\t\tmax_acc_test:', max_acc_test error_sum = 0.0 all_pred_labels = [] all_gold_labels = [] for test_batch_id in test_batch_start: # for each test batch pred_labels = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size]) gold_labels = test_labels[test_batch_id:test_batch_id + batch_size] # gold_labels_matrix = [] # for lab in gold_labels: # vec = [0]*8 # vec[lab]=1 # gold_labels_matrix.append(vec) # gold_labels_matrix = np.asarray(gold_labels_matrix, dtype='int32') all_pred_labels.append(pred_labels) all_gold_labels.append(gold_labels) all_pred_labels = np.concatenate(all_pred_labels) all_gold_labels = np.concatenate(all_gold_labels) test_mean_f1, test_weight_f1 = average_f1_two_array_by_col_firstK( all_pred_labels, all_gold_labels, 8) if test_weight_f1 > max_weightf1_test: max_weightf1_test = test_weight_f1 if test_mean_f1 > max_meanf1_test: max_meanf1_test = test_mean_f1 print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - 
start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'): model_options = locals().copy() print "model options", model_options seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl') test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len) test_sents_l=np.asarray(test_sents_l, dtype='int32') test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX) test_sents_r=np.asarray(test_sents_r, dtype='int32') test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX) test_word1=np.asarray(test_word1, dtype='int32') test_word2=np.asarray(test_word2, dtype='int32') test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX) test_labels_store=np.asarray(test_labels, dtype='int32') test_extra=np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size=len(test_sents_l) print ' test size: ', test_size vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution 
rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings]) #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1) word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para=[conv_W, conv_b,conv_W_context] ''' attentive convolution 
function ''' term_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = word1_mask, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1) term_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_input_r, input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_input_r, mask_matrix = word1_mask, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1) def_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_input_l, input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = sents_mask_l, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = 
(batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1) def_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_input_r, input_tensor3=embed_input_l, input_tensor3_r = embed_input_r, mask_matrix = sents_mask_l, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1) if p_mode == 'conc': p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4*4*hidden_size[1] else: p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0) p_len =4*hidden_size[1] # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) # HL_input_size=p_len+1+1 # # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh) "form input to LR classifier" LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) LR_input_size=p_len+1+1 # 
LR_input = HL_layer_1.output # LR_input_size = hidden_size[1] U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() params = NN_para+LR_para #[init_embeddings] # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params) load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params) ''' 0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424 0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538 ''' test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_test_batches=test_size/batch_size n_test_remain = test_size%batch_size if n_test_remain!=0: test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] else: test_batch_start=list(np.arange(n_test_batches)*batch_size) # max_acc_dev=0.0 max_ap_test=0.0 max_ap_topk_test=0.0 max_f1=0.0 pred_labels =[] probs = [] gold_labels =[] error_sum=0.0 for idd, test_batch_id in enumerate(test_batch_start): # for each test batch pred_i, prob_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_word1[test_batch_id:test_batch_id+batch_size], test_word2[test_batch_id:test_batch_id+batch_size], test_word1_mask[test_batch_id:test_batch_id+batch_size], test_word2_mask[test_batch_id:test_batch_id+batch_size], test_extra[test_batch_id:test_batch_id+batch_size]) # error_sum+=error_i pred_labels+=list(pred_i) probs+=list(prob_i) print len(test_sents_l), len(probs) if n_test_remain !=0: probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:] print len(test_sents_l), len(probs) assert len(test_sents_l) == len(probs) assert sum(group_size_list) == len(probs) #max prob in group max_probs = [] prior_size = 0 for i in range(len(group_size_list)): sub_probs = probs[prior_size:prior_size+group_size_list[i]] prior_size += group_size_list[i] max_probs.append(max(sub_probs)) print len(group_size_list),len(max_probs),len(test_labels) assert len(test_labels) == len(max_probs) # test_acc=1.0-error_sum/(len(test_batch_start)) test_ap = apk(test_labels, max_probs, k=len(test_labels)) test_ap_top100 = apk(test_labels, max_probs, k=100) # if test_ap > max_ap_test: # max_ap_test=test_ap # 
store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params) # if test_ap_top100 > max_ap_topk_test: # max_ap_topk_test=test_ap_top100 print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=0.0000001, extra_size=4, emb_size=300, batch_size=50, filter_size=[3, 5], maxSentLen=60, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_NYT_dataset( maxSentLen, word2id) train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(test_sents_l, dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(test_sents_r, dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(test_labels, dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to 
initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1)) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1])) conv_W_context_2, conv_b_context_2 = create_conv_para( rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1)) NN_para = [ conv_W, conv_b, conv_W_context, conv_W_2, conv_b_2, conv_W_context_2 ] ''' attentive convolution function ''' attentive_conv_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, 
mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, hidden_size[0], maxSentLen), image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen), filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]), filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r attentive_conv_layer_2 = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, hidden_size[0], maxSentLen), image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen), filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]), filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1), W=conv_W_2, b=conv_b_2, W_context=conv_W_context_2, b_context=conv_b_context_2) attentive_sent_embeddings_l_2 = attentive_conv_layer_2.attentive_maxpool_vec_l attentive_sent_embeddings_r_2 = attentive_conv_layer_2.attentive_maxpool_vec_r #form input to HL layers HL_layer_1_input = T.concatenate([ attentive_sent_embeddings_l, attentive_sent_embeddings_r, attentive_sent_embeddings_l * attentive_sent_embeddings_r, attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2, attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2 ], axis=1) HL_layer_1_input_size = 6 * hidden_size[1] HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[1], activation=T.nnet.relu) HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[1], n_out=hidden_size[1], activation=T.nnet.relu) # LR_input_size=HL_layer_1_input_size+2*hidden_size[0] "form input to LR classifier" LR_input = T.tanh( T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1)) 
LR_input_size = HL_layer_1_input_size + 2 * hidden_size[1] U_a = create_ensemble_para( rng, 3, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. ''' testing ''' test_preds = T.argmax(layer_LR.p_y_given_x, axis=1) transfered_preds = T.eq(test_preds, 2) test_error = T.mean(T.neq(transfered_preds, labels)) params = [init_embeddings ] + NN_para + HL_layer_1.params + HL_layer_2.params + LR_para cost = loss updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], [test_error, transfered_preds], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_labels = [] gold_labels = [] error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i, pred_labels_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], 
test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i pred_labels += list(pred_labels_i) gold_labels += list( test_labels_store[test_batch_id:test_batch_id + batch_size]) test_acc = 1.0 - error_sum / (len(test_batch_start)) test_f1 = f1_score_2_binary_list( gold_labels, pred_labels) #, average='binary') if test_acc > max_acc_test: max_acc_test = test_acc if test_f1 > max_f1: max_f1 = test_f1 # store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params) print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1 print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, batch_size=20, test_batch_size=200, emb_size=300, hidden_size=300, L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=50.302743615): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'; rng = numpy.random.RandomState(23455) # glove_vocab=set(word2vec.keys()) train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit) train_size=len(train_para_list) if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask): print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)' exit(0) test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test(word2id, para_len_limit, q_len_limit) test_size=len(test_para_list) if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask): print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)' exit(0) rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234)) # rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in overall_word2id.iteritems()} # word2vec=load_glove() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) embeddings=theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') # labels = T.imatrix('labels') #(batch, para_len) gold_indices= T.ivector() #batch para_mask=T.fmatrix('para_mask') q_mask=T.fmatrix('q_mask') extraF=T.ftensor3('extraF') # should be in shape (batch, 
wordsize, 3) is_train = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' true_batch_size=paragraph.shape[0] norm_extraF=normalize_matrix(extraF) U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size) U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size) paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] U_e1, W_e1, b_e1=create_GRU_para(rng, 3*hidden_size+3, hidden_size) U_e1_b, W_e1_b, b_e1_b=create_GRU_para(rng, 3*hidden_size+3, hidden_size) paragraph_para_e1=[U_e1, W_e1, b_e1, U_e1_b, W_e1_b, b_e1_b] UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size) UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size) Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] # W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size)) # W_a2 = create_ensemble_para(rng, hidden_size, hidden_size) U_a = create_ensemble_para(rng, 1, 2*hidden_size) # 3 extra features # LR_b = theano.shared(value=numpy.zeros((2,), # dtype=theano.config.floatX), # @UndefinedVariable # name='LR_b', borrow=True) HL_paras=[U_a] params = [embeddings]+paragraph_para+Q_para+paragraph_para_e1+HL_paras load_model_from_file(rootPath+'Best_Paras_conv_50.302743614', params) paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen) concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1) paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b) para_reps=paragraph_model.output_tensor #(batch, emb, para_len) # #LSTM # fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size) # paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters # 
paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict) # para_reps=paragraph_model.output_tensor Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength) questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b) questions_reps_tensor=questions_model.output_tensor questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden) questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1) #(batch, para_len, hidden) # #LSTM for questions # fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size) # Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters # questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict) # questions_reps_tensor=questions_model.output_tensor # def example_in_batch(para_matrix, q_matrix): #assume both are (hidden, len) transpose_para_matrix=para_matrix.T interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len) norm_interaction_matrix=T.nnet.softmax(interaction_matrix) # norm_interaction_matrix=T.maximum(0.0, interaction_matrix) return T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(len, para_len) batch_q_reps, updates = theano.scan(fn=example_in_batch, outputs_info=None, sequences=[para_reps, questions_reps_tensor]) #batch_q_reps (batch, hidden, para_len) #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len) ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), norm_extraF.dimshuffle(0,2,1)], axis=1) 
#(batch, 3*hidden+3, para_len) para_ensemble_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,U=U_e1,W=W_e1,b=b_e1,Ub=U_e1_b,Wb=W_e1_b,bb=b_e1_b) para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, hidden ,para_len) para_reps_tensor4score = dropout_standard(is_train, para_reps_tensor4score, 0.2, rng) #for span reps span_1=T.concatenate([para_reps_tensor4score, para_reps_tensor4score], axis=1) #(batch, 2*hidden ,para_len) span_2=T.concatenate([para_reps_tensor4score[:,:,:-1], para_reps_tensor4score[:,:,1:]], axis=1) #(batch, 2*hidden ,para_len-1) span_3=T.concatenate([para_reps_tensor4score[:,:,:-2], para_reps_tensor4score[:,:,2:]], axis=1) #(batch, 2*hidden ,para_len-2) span_4=T.concatenate([para_reps_tensor4score[:,:,:-3], para_reps_tensor4score[:,:,3:]], axis=1) #(batch, 2*hidden ,para_len-3) span_5=T.concatenate([para_reps_tensor4score[:,:,:-4], para_reps_tensor4score[:,:,4:]], axis=1) #(batch, 2*hidden ,para_len-4) span_6=T.concatenate([para_reps_tensor4score[:,:,:-5], para_reps_tensor4score[:,:,5:]], axis=1) #(batch, 2*hidden ,para_len-5) span_7=T.concatenate([para_reps_tensor4score[:,:,:-6], para_reps_tensor4score[:,:,6:]], axis=1) #(batch, 2*hidden ,para_len-6) span_8=T.concatenate([para_reps_tensor4score[:,:,:-7], para_reps_tensor4score[:,:,7:]], axis=1) #(batch, 2*hidden ,para_len-7) span_9=T.concatenate([para_reps_tensor4score[:,:,:-8], para_reps_tensor4score[:,:,8:]], axis=1) #(batch, 2*hidden ,para_len-8) span_10=T.concatenate([para_reps_tensor4score[:,:,:-9], para_reps_tensor4score[:,:,9:]], axis=1) #(batch, 2*hidden ,para_len-9) span_11=T.concatenate([para_reps_tensor4score[:,:,:-10], para_reps_tensor4score[:,:,10:]], axis=1) #(batch, 2*hidden ,para_len-10) span_12=T.concatenate([para_reps_tensor4score[:,:,:-11], para_reps_tensor4score[:,:,11:]], axis=1) #(batch, 2*hidden ,para_len-11) span_13=T.concatenate([para_reps_tensor4score[:,:,:-12], para_reps_tensor4score[:,:,12:]], 
axis=1) #(batch, 2*hidden ,para_len-12) span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7, span_8, span_9, span_10, span_11, span_12, span_13], axis=2) #(batch, 2*hidden, 13*para_len-78) test_span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7], axis=2) #(batch, 2*hidden, 5*para_len-10) #, span_6, span_7 #score each span reps norm_U_a=normalize_matrix(U_a) span_scores_tensor=T.dot(span_reps.dimshuffle(0,2,1), norm_U_a) #(batch, 13*para_len-78, 1) span_scores=T.nnet.softmax(span_scores_tensor.reshape((true_batch_size, 13*paragraph.shape[1]-78))) #(batch, 7*para_len-21) loss=-T.sum(T.log(span_scores[T.arange(true_batch_size), gold_indices])) test_span_scores_tensor=T.dot(test_span_reps.dimshuffle(0,2,1), norm_U_a) #(batch, 7*para_len-21, 1) test_span_scores=T.nnet.softmax(test_span_scores_tensor.reshape((true_batch_size, 7*paragraph.shape[1]-21))) #(batch, 7*para_len-21) test_return=T.argmax(test_span_scores, axis=1) #batch #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b] # L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a]) # L2_reg = L2norm_paraList([embeddings]) cost=loss#+ConvGRU_1.error# accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): # print grad_i.type acc = acc_i + T.sqr(grad_i) updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8))) #AdaGrad updates.append((acc_i, acc)) # updates=Adam(cost, params, lr=0.0001) train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, extraF, is_train], cost, updates=updates,on_unused_input='ignore') test_model = theano.function([paragraph, questions,para_mask, q_mask, 
extraF, is_train], test_return, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False #para_list, Q_list, label_list, mask, vocab_size=load_train() n_train_batches=train_size/batch_size # remain_train=train_size%batch_size train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size] n_test_batches=test_size/test_batch_size # remain_test=test_size%batch_size test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size] max_F1_acc=0.0 max_exact_acc=0.0 cost_i=0.0 train_ids = range(train_size) while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 random.shuffle(train_ids) iter_accu=0 for para_id in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu +1 iter_accu+=1 # haha=para_mask[para_id:para_id+batch_size] # print haha # for i in range(batch_size): # print len(haha[i]) cost_i+= train_model( numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX), 1) #print iter if iter%10==0: print 'Epoch ', epoch, 'iter '+str(iter)+' 
average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min' print 'Testing...' past_time = time.time() # writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') # writefile.write('{') pred_dict={} # exact_match=0.0 # F1_match=0.0 q_amount=0 for test_para_id in test_batch_start: batch_predict_ids=test_model( numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'), numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX), 0) # print distribution_matrix test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size] # para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size] q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size] # print 'q_ids_batch:', q_ids_batch # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size] # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size] # para_len=len(test_para_wordlist_list[0]) # if para_len!=len(distribution_matrix[0]): # print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0]) # exit(0) # q_size=len(distribution_matrix) q_amount+=test_batch_size # print q_size # print test_para_word_list # Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size] for q in range(test_batch_size): #for each question # if len(distribution_matrix[q])!=len(test_label_matrix[q]): # print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q]) # else: # ss=len(distribution_matrix[q]) # combine_list=[] # for ii in range(ss): # 
combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')') # print combine_list # exit(0) # print 'distribution_matrix[q]:',distribution_matrix[q] pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_list[q]) q_id=q_ids_batch[q] pred_dict[q_id]=pred_ans # writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ') # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q]) # q_gold_ans_set=para_gold_ansset_list[q] # # print test_para_wordlist_list[q] # # print Q_list_inword[q] # # print pred_ans.encode('utf8'), q_gold_ans_set # if pred_ans in q_gold_ans_set: # exact_match+=1 # F1=MacroF1(pred_ans, q_gold_ans_set) # F1_match+=F1 with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile: json.dump(pred_dict, outfile) F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt') # F1_acc=F1_match/q_amount # exact_acc=exact_match/q_amount if F1_acc> max_F1_acc: max_F1_acc=F1_acc if exact_acc> max_exact_acc: max_exact_acc=exact_acc if max_exact_acc > max_EM: store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params) print 'Finished storing best params at:', max_exact_acc print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc # os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt') if patience <= iter: done_looping = True break print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + 
os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def _batch_starts(size, batch_size):
    """Return start offsets of fixed-size batches that cover ``size`` rows.

    The compiled graph reshapes its input to exactly ``batch_size`` rows, so
    every batch must be full.  Full batches start at multiples of
    ``batch_size``; if rows are left over, one extra batch ending at the last
    row is added (it overlaps the previous batch).  No extra batch is added
    when ``size`` is an exact multiple of ``batch_size``, so the final batch
    is never processed twice.

    Raises:
        ValueError: if ``size`` is smaller than ``batch_size`` (no full batch
            can be formed).
    """
    if size < batch_size:
        raise ValueError('need at least batch_size=%d examples, got %d' %
                         (batch_size, size))
    starts = list(range(0, size - batch_size + 1, batch_size))
    if size % batch_size != 0:
        starts.append(size - batch_size)
    return starts


def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    L2_weight=0.000001,
                    drop_p=0.05,
                    emb_size=300,
                    hidden_size=500,
                    HL_hidden_size=500,
                    batch_size=5,
                    filter_size=[3, 5, 7],
                    maxSentLen=180,
                    comment=''):
    """Train the gated / soft-attentive CNN classifier and track test accuracy.

    Builds a Theano graph (embedding lookup -> gated convolution -> two
    ``Conv_for_Pair_SoftAttend`` layers applied to the sentence paired with
    itself -> two ReLU hidden layers -> 5-way logistic regression), trains it
    with AdaGrad on the training split returned by ``load_yelp_dataset`` and
    evaluates on the test split every 2000 training batches.

    Args:
        learning_rate: AdaGrad step size.
        n_epochs: number of passes over the training set.
        L2_weight: currently unused (L2 regularisation is disabled); kept for
            interface compatibility.
        drop_p: currently unused (dropout is disabled); kept for interface
            compatibility.
        emb_size: word-embedding dimensionality.
        hidden_size: number of filters of each attentive convolution.
        HL_hidden_size: width of the two hidden layers.
        batch_size: number of sentences per batch; it is baked into the graph
            via ``reshape``, so every batch fed to the model must be full.
        filter_size: convolution widths; only ``filter_size[0]`` and
            ``filter_size[1]`` are used.  The default list is never mutated.
        maxSentLen: sentence length passed to ``load_yelp_dataset`` as
            ``maxlen`` and used as the time dimension of the input tensor.
        comment: free text; only echoed in the "model options" printout.

    Returns:
        The best test accuracy observed at any evaluation point (``0.0`` if no
        evaluation was triggered).

    Raises:
        ValueError: if the training or test split has fewer than
            ``batch_size`` examples.
    """
    # Must stay the first statement: locals() is exactly the argument set here.
    model_options = locals().copy()
    print('model options ' + str(model_options))

    n_classes = 5  # size of the softmax output layer

    rng = np.random.RandomState(1234)  # fixed seed for reproducible runs
    # Dropout is disabled, so srng is unused; the rng.randint() draw is kept
    # on purpose so that every later draw from `rng` (and hence the initial
    # weights) is unchanged.
    srng = RandomStreams(rng.randint(999999))

    # minlen=2: one label plus at least one word per example.
    all_sentences, all_masks, all_labels, word2id = load_yelp_dataset(
        maxlen=maxSentLen, minlen=2)
    # Index 0 is the training split and index 2 the test split.  Index 1 (the
    # dev split) is not evaluated by this function, so it is not converted.
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # plus one row for the zero pad index
    # Gaussian initialisation of the embedding matrix; row 0 (padding) is zero.
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))
    rand_values[0] = np.zeros(emb_size, dtype=theano.config.floatX)
    # NOTE: pre-trained initialisation is disabled.  The GloVe file used to be
    # loaded here although nothing consumed it; to re-enable, build id2word
    # from word2id, load the vectors with load_word2vec_file(...) and pass
    # both to load_word2vec_to_init(rand_values, id2word, word2vec).
    embeddings = theano.shared(value=np.array(rand_values,
                                              dtype=theano.config.floatX),
                               borrow=True)

    # Symbolic inputs of the model.
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.ivector('labels')
    # Accepted by the compiled functions (1 = train, 0 = test) but not
    # connected to the graph while dropout is disabled.
    train_flag = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # (batch, maxSentLen, emb_size) -> (batch, emb_size, maxSentLen)
    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    # Masked sum of word embeddings: (batch, emb_size)
    bow = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)

    # Parameter creation order is significant: each call draws from `rng`.
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_2_pre, conv_b_2_pre = create_conv_para(
        rng, filter_shape=gate_filter_shape)
    conv_W_2_gate, conv_b_2_gate = create_conv_para(
        rng, filter_shape=gate_filter_shape)
    conv_W, conv_b = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))
    conv_W2, conv_b2 = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    soft_att_W_big, soft_att_b_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W_small, _ = create_HiddenLayer_para(rng, emb_size, 1)
    soft_att_W2_big, soft_att_b2_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W2_small, _ = create_HiddenLayer_para(rng, emb_size, 1)

    # Trainable convolution / attention parameters.  The context biases
    # (conv_b_context, conv_b2_context) are passed to the layers below but are
    # deliberately left out of this list, exactly as before.
    NN_para = [
        conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate,
        conv_W, conv_b, conv_W_context,
        conv_W2, conv_b2, conv_W2_context,
        soft_att_W_big, soft_att_b_big, soft_att_W_small,
        soft_att_W2_big, soft_att_b2_big, soft_att_W2_small,
    ]

    conv_layer_1_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_2_pre,
        b=conv_b_2_pre,
        W_gate=conv_W_2_gate,
        b_gate=conv_b_2_gate)
    advanced_sent_tensor3 = conv_layer_1_gate_l.output_tensor3

    # Both attentive convolutions pair the sentence with itself (the left and
    # right inputs are the same tensor); they differ in filter width only.
    conv_layer_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context,
        soft_att_W_big=soft_att_W_big,
        soft_att_b_big=soft_att_b_big,
        soft_att_W_small=soft_att_W_small)
    conv_layer_2_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context,
        soft_att_W_big=soft_att_W2_big,
        soft_att_b_big=soft_att_b2_big,
        soft_att_W_small=soft_att_W2_small)

    sent_embeddings = conv_layer_pair.maxpool_vec_l
    att_sent_embeddings = conv_layer_pair.attentive_maxpool_vec_l
    sent_embeddings_2 = conv_layer_2_pair.maxpool_vec_l
    att_sent_embeddings_2 = conv_layer_2_pair.attentive_maxpool_vec_l

    # Classification head: bag-of-words plus the four pooled vectors feed two
    # hidden layers; the logistic regression sees the raw features and both
    # hidden outputs and maps them to `n_classes` scores.
    HL_input = T.concatenate([
        bow, sent_embeddings, att_sent_embeddings, sent_embeddings_2,
        att_sent_embeddings_2
    ],
                             axis=1)
    HL_input_size = hidden_size * 4 + emb_size

    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_input_size, HL_hidden_size)
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_input,
                             n_in=HL_input_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.nnet.relu)

    HL_layer_2_W, HL_layer_2_b = create_HiddenLayer_para(
        rng, HL_hidden_size, HL_hidden_size)
    HL_layer_2_params = [HL_layer_2_W, HL_layer_2_b]
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=HL_hidden_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_2_W,
                             b=HL_layer_2_b,
                             activation=T.nnet.relu)

    LR_input = T.concatenate(
        [HL_input, HL_layer_1.output, HL_layer_2.output], axis=1)
    LR_input_size = HL_input_size + 2 * HL_hidden_size
    U_a = create_ensemble_para(rng, n_classes, LR_input_size)
    LR_b = theano.shared(value=np.zeros((n_classes, ),
                                        dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  # one bias per target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(rng,
                                  input=LR_input,
                                  n_in=LR_input_size,
                                  n_out=n_classes,
                                  W=U_a,
                                  b=LR_b)
    # Negative log-likelihood of the gold labels; lower is better.
    loss = layer_LR.negative_log_likelihood(labels)

    # All trainable parameters, embeddings included.
    params = ([embeddings] + NN_para + HL_layer_1_params +
              HL_layer_2_params + LR_para)

    # L2 regularisation is disabled, so the cost is the plain loss and
    # L2_weight has no effect.
    cost = loss
    grads = T.grad(cost, params)

    # AdaGrad: one accumulator of squared gradients per parameter.
    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True),
                              dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        # 1e-8 guards against division by zero.
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))
        updates.append((acc_i, acc))

    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        [layer_LR.errors(labels), layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time

    train_batch_start = _batch_starts(train_size, batch_size)
    test_batch_start = _batch_starts(test_size, batch_size)

    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = list(range(train_size))
    # Running count of training batches across all epochs.  Counting the
    # actual calls keeps it unique per batch, so the periodic evaluation fires
    # once per 2000 batches and `cost_i / iter_count` is a true average.
    iter_count = 0
    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        # The list is shuffled in place, so the same seeded permutation is
        # composed onto the previous order and the order differs every epoch.
        random.Random(200).shuffle(train_indices)
        for batch_id in train_batch_start:
            iter_count += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], 1)

            # Every 2000 batches, score the model on the whole test split.
            if iter_count % 2000 == 0:
                print('Epoch  %s iter %s average cost: %s uses  %s min' %
                      (epoch, iter_count, cost_i / iter_count,
                       (time.time() - past_time) / 60.0))
                past_time = time.time()

                error_sum = 0.0
                for test_batch_id in test_batch_start:
                    batch_end = test_batch_id + batch_size
                    error_i, _pred_labels = test_model(
                        test_sents[test_batch_id:batch_end],
                        test_masks[test_batch_id:batch_end],
                        test_labels[test_batch_id:batch_end], 0)
                    error_sum += error_i
                # Mean of per-batch error rates.  When the test size is not a
                # multiple of batch_size the last batch overlaps the previous
                # one, so those examples carry slightly more weight.
                test_accuracy = 1.0 - error_sum / len(test_batch_start)
                if test_accuracy > max_acc_test:
                    max_acc_test = test_accuracy
                print('\t\tcurrent testbacc: %s \t\t\t\t\tmax_acc_test: %s' %
                      (test_accuracy, max_acc_test))

        print('Epoch  %s uses  %s min' %
              (epoch, (time.time() - mid_time) / 60.0))
        mid_time = time.time()

    end_time = time.time()
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')

    return max_acc_test