# Assumes module-level imports above this section: numpy; theano; theano.tensor as T;
# time; random; codecs; json; sys; os; plus the project helpers used below
# (load_train_AI2, load_dev_or_test_AI2, random_value_normal, load_glove,
# load_word2vec_to_init, create_LSTM_para, create_ensemble_para, normalize_matrix,
# Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate, decode_predict_id_AI2,
# standard_eval, store_model_to_file).

# BiLSTM trainer: matches paragraph and question through a learned interaction
# matrix, predicts start/end distributions, and decodes the best span (length <= 7)
# from the outer product of the two distributions.
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=10, test_batch_size=200, emb_size=300, hidden_size=100, L2_weight=0.0001, para_len_limit=300, q_len_limit=30, max_EM=40.0):
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

#     glove_vocab=set(word2vec.keys())
    train_para_list, train_Q_list, train_start_list, train_end_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train_AI2(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_start_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test_AI2(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y:x for x,y in overall_word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
#     labels = T.imatrix('labels')  #(batch, para_len)
    start_indices= T.ivector()  #batch
    end_indices = T.ivector()  #batch
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size=paragraph.shape[0]

    norm_extraF=normalize_matrix(extraF)

    fwd_para=create_LSTM_para(rng, emb_size, hidden_size)  #create_LSTM_para(rng, word_dim, hidden_dim)
    bwd_para=create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para=fwd_para.values()+ bwd_para.values()

    fwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size)
    bwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size)
    paragraph_para_e1=fwd_e1.values()+ bwd_e1.values()

    fwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size)
    bwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size)
    paragraph_para_e11=fwd_e11.values()+ bwd_e11.values()

    fwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size)
    bwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size)
    paragraph_para_e2=fwd_e2.values()+ bwd_e2.values()

#     U_e2, W_e2, b_e2=create_GRU_para(rng, hidden_size, hidden_size)
#     U_e2_b, W_e2_b, b_e2_b=create_GRU_para(rng, hidden_size, hidden_size)
#     paragraph_para_e2=[U_e2, W_e2, b_e2, U_e2_b, W_e2_b, b_e2_b]

#     fwd_Q=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_Q=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_Q.values()+ bwd_Q.values()

#     W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)  # init_weights((2*hidden_size, hidden_size))
#     W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a1 = create_ensemble_para(rng, 1, 10*hidden_size)
    U_a2 = create_ensemble_para(rng, 1, 10*hidden_size)
    U_a3 = create_ensemble_para(rng, 1, 6*hidden_size)

#     LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX),  # @UndefinedVariable
#                          name='LR_b', borrow=True)

    HL_paras=[U_a1, U_a2, U_a3]
    params = [embeddings]+paragraph_para+paragraph_para_e1+paragraph_para_e11+HL_paras+paragraph_para_e2

#     load_model_from_file(rootPath+'Best_Paras_AI2_31.210974456', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2, 1))  # (batch_size, emb_size, maxparalen)

    #self, X, Mask, hidden_dim, fwd_tparams, bwd_tparams
    paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size, fwd_tparams=fwd_para, bwd_tparams=bwd_para)
    para_reps=paragraph_model.output_tensor  #(batch, 2*hidden, para_len)

    Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2, 1))  #(#questions, emb_size, maxsenlength)
    questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, fwd_tparams=fwd_para, bwd_tparams=bwd_para)
    questions_reps_tensor=questions_model.output_tensor  #(batch, 2*hidden, q_len)
#     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size))  #(batch, 1, hidden)
#     questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)  #(batch, para_len, hidden)

#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()  # .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor

#     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
#     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
#     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
#     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
#     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
#     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
#     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)

    norm_U_a3=normalize_matrix(U_a3)
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (2*hidden, len)
        repeat_para_matrix_T=T.repeat(para_matrix.T, q_matrix.shape[1], axis=0)  #(para_len*q_len, 2*hidden)
        repeat_q_matrix_3D = T.repeat(q_matrix.T.dimshuffle('x', 0, 1), para_matrix.shape[1], axis=0)  #(para_len, q_len, 2*hidden)
        repeat_q_matrix_T= repeat_q_matrix_3D.reshape((repeat_q_matrix_3D.shape[0]*repeat_q_matrix_3D.shape[1], repeat_q_matrix_3D.shape[2]))  #(para_len*q_len, 2*hidden)
        ele_mult =repeat_para_matrix_T*repeat_q_matrix_T  #(para_len*q_len, 2*hidden)
        overall_concv = T.concatenate([repeat_para_matrix_T, repeat_q_matrix_T, ele_mult], axis=1)  #(para_len*q_len, 6*hidden)
        scores=T.dot(overall_concv, norm_U_a3)  #(para_len*q_len, 1)
        interaction_matrix=scores.reshape((para_matrix.shape[1], q_matrix.shape[1]))  #(para_len, q_len)
#         transpose_para_matrix=para_matrix.T
#         interaction_matrix=T.dot(transpose_para_matrix, q_matrix)  #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
#         norm_interaction_matrix=T.maximum(0.0, interaction_matrix)
        q_by_para = T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x', 0)  #(2*hidden, para_len)
        para_by_q = T.repeat(T.dot(para_matrix, T.nnet.softmax(T.max(interaction_matrix, axis=1).dimshuffle('x', 0)).T), para_matrix.shape[1], axis=1)
        return (q_by_para, para_by_q)

    inter_return, updates = theano.scan(fn=example_in_batch,
                                        outputs_info=None,
                                        sequences=[para_reps, questions_reps_tensor])  #batch_q_reps (batch, hidden, para_len)

    batch_q_reps=inter_return[0]  #(batch, 2*hidden, para_len)
    batch_para_reps=inter_return[1]  #(batch, 2*hidden, para_len)

    #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden, para_len)
    ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps, para_reps*batch_q_reps, para_reps*batch_para_reps], axis=1)  #(batch, 4*2*hidden, para_len)

    para_ensemble_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size, fwd_tparams=fwd_e1, bwd_tparams=bwd_e1)
    para_reps_tensor4score=para_ensemble_model.output_tensor  #(batch, 2*hidden, para_len)

    para_ensemble_model1=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score, Mask=para_mask, hidden_dim=hidden_size, fwd_tparams=fwd_e11, bwd_tparams=bwd_e11)
    para_reps_tensor4score1=para_ensemble_model1.output_tensor  #(batch, 2*hidden, para_len)

    Con_G_M=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score1], axis=1)  #(batch, 10*hidden, para_len)

    #score for each para word
    norm_U_a=normalize_matrix(U_a1)
    start_scores=T.dot(Con_G_M.dimshuffle(0, 2, 1), norm_U_a)  #(batch, para_len, 1)
    start_scores=T.nnet.softmax(start_scores.reshape((true_batch_size, paragraph.shape[1])))  #(batch, para_len)

#     para_reps_tensor4score = T.concatenate([para_reps_tensor4score, start_scores.dimshuffle(0,'x',1)], axis=1)
    para_ensemble_model2=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score1, Mask=para_mask, hidden_dim=hidden_size, fwd_tparams=fwd_e2, bwd_tparams=bwd_e2)
    para_reps_tensor4score2=para_ensemble_model2.output_tensor  #(batch, 2*hidden, para_len)

    Con_G_M2=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score2], axis=1)  #(batch, 10*hidden, para_len)
    norm_U_a2=normalize_matrix(U_a2)
    end_scores=T.dot(Con_G_M2.dimshuffle(0, 2, 1), norm_U_a2)  #(batch, para_len, 1)
    end_scores=T.nnet.softmax(end_scores.reshape((true_batch_size, paragraph.shape[1])))  #(batch, para_len)

    #loss for training
    loss=-T.mean(T.log(start_scores[T.arange(true_batch_size), start_indices])+T.log(end_scores[T.arange(true_batch_size), end_indices]))

    #test: outer product of start and end distributions over positions
    co_simi_batch_matrix=T.batched_dot((para_mask*start_scores).dimshuffle(0, 1, 'x'), (para_mask*end_scores).dimshuffle(0, 'x', 1))  #(batch, para_len, para_len)

    #reset lower diagonal (end index must not precede start index)
    cols = numpy.concatenate([numpy.array(range(i), dtype=numpy.uint) for i in xrange(para_len_limit)])
    rows = numpy.concatenate([numpy.array([i]*i, dtype=numpy.uint) for i in xrange(para_len_limit)])
    c = T.set_subtensor(co_simi_batch_matrix[:, rows, cols], theano.shared(numpy.zeros(para_len_limit*(para_len_limit-1)/2)))

    #reset spans longer than 7 words
    cols2 = numpy.concatenate([numpy.array(range(i+7, para_len_limit), dtype=numpy.uint) for i in xrange(para_len_limit-7)])
    rows2 = numpy.concatenate([numpy.array([i]*(para_len_limit-7-i), dtype=numpy.uint) for i in xrange(para_len_limit-7)])
    c2 = T.set_subtensor(c[:, rows2, cols2], theano.shared(numpy.zeros((para_len_limit-7)*(para_len_limit-6)/2)))

    test_return=T.argmax(c2.reshape((true_batch_size, para_len_limit*para_len_limit)), axis=1)  #batch

#     params = layer3.params + layer2.params + layer1.params + [conv_W, conv_b]
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=loss  #+ConvGRU_1.error

    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions, start_indices, end_indices, para_mask, q_mask, extraF], cost, updates=updates, on_unused_input='ignore')

    test_model = theano.function([paragraph, questions, para_mask, q_mask, extraF], test_return, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.

    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]

    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]

    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter counts how many batches have been run over all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_start_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_end_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX))

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
#                 writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8')
#                 writefile.write('{')
                pred_dict={}
#                 exact_match=0.0
#                 F1_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    batch_predict_ids=test_model(
                        numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                        numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                        numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                        numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                        numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX))
#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size]
#                     para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size]
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
#                     print 'q_ids_batch:', q_ids_batch
#                     paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
#                     sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
#                     para_len=len(test_para_wordlist_list[0])
#                     if para_len!=len(distribution_matrix[0]):
#                         print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
#                         exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=test_batch_size
#                     print q_size
#                     print test_para_word_list
#                     Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size]
                    for q in range(test_batch_size):  #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:', distribution_matrix[q]
                        pred_ans=decode_predict_id_AI2(batch_predict_ids[q], para_len_limit, test_para_wordlist_list[q])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
#                         writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ')
#                         pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
#                         q_gold_ans_set=para_gold_ansset_list[q]
#                         print test_para_wordlist_list[q]
#                         print Q_list_inword[q]
#                         print pred_ans.encode('utf8'), q_gold_ans_set
#                         if pred_ans in q_gold_ans_set:
#                             exact_match+=1
#                         F1=MacroF1(pred_ans, q_gold_ans_set)
#                         F1_match+=F1
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
#                 F1_acc=F1_match/q_amount
#                 exact_acc=exact_match/q_amount
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Paras_AI2_'+str(max_exact_acc), params)
                        print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
#                 os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt')
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
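
# Editor's note (illustrative sketch, not part of the original code): the
# test-time decoding above builds an outer product of the start and end
# distributions, zeroes the lower triangle (so the end cannot precede the
# start) and every span longer than 7 tokens, then takes a flat argmax.
# The hypothetical helper below replays that logic for a single example in
# plain NumPy; it relies on the module-level numpy import and exists only
# to document the constrained-argmax idea.
def _decode_best_span_sketch(start_probs, end_probs, max_span_len=7):
    para_len = len(start_probs)
    span_scores = numpy.outer(start_probs, end_probs)  # (para_len, para_len)
    for i in xrange(para_len):
        span_scores[i, :i] = 0.0                  # end index before start index
        span_scores[i, i + max_span_len:] = 0.0   # span longer than max_span_len
    best = numpy.argmax(span_scores)              # flat argmax, as in test_return
    return best // para_len, best % para_len      # (start_index, end_index)
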
# Word-level start/end trainer: char-CNN plus word-CNN features, trained with
# negative log-likelihood and a margin ranking loss over paragraph positions.
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size=[5,5], char_filter_size=3, margin=2.0, max_EM=50.302743615):
    test_batch_size=batch_size*10
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    word2id={}
    char2id={}
    #questions, paragraphs, q_masks, p_masks, labels, word2id
    train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size=len(train_para_list)

    test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size=len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX)

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng)
    rand_values[0]=numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices= T.imatrix()  #batch, (start, end) for each sample
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')

    char_paragraph = T.imatrix()  #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask=T.fmatrix()
    char_q_mask=T.fmatrix()

    true_p_len = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size = paragraph.shape[0]

    common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size, true_p_len, emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    common_input_q=embeddings[questions.flatten()].reshape((true_batch_size, q_len_limit, emb_size))

    char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len))
    char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len))

    conv_W_char, conv_b_char=create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0]))
    conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))

    conv_W_1_q, conv_b_1_q=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0]))
    conv_W_2_q, conv_b_2_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))
    NN_para=[conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char]

    input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,
                                      batch_size, p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size,
                                      conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char,
                                      para_mask, q_mask, char_p_masks, char_q_masks)  #(batch, 2*hidden, p_len_limit)

    test_input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,
                                           test_batch_size, test_p_len_limit, q_len_limit, emb_size, char_emb_size, char_len, filter_size, char_filter_size, hidden_size,
                                           conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char,
                                           para_mask, q_mask, char_p_masks, char_q_masks)  #(batch, 2*hidden, test_p_len_limit)

#     gram_size = 5*true_p_len-(0+1+2+3+4)

    HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a=normalize_matrix(U_a)
    norm_HL_1_para=normalize_matrix(HL_1_para)
    norm_HL_2_para=normalize_matrix(HL_2_para)
    norm_HL_3_para=normalize_matrix(HL_3_para)
    norm_HL_4_para=normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size)
    end_norm_U_a=normalize_matrix(end_U_a)
    end_norm_HL_1_para=normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para=normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para=normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para=normalize_matrix(end_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_U_a, batch_size, true_p_len)
    span_scores=T.nnet.softmax(span_scores_matrix)  #(batch, para_len)
    end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, batch_size, true_p_len)
    end_span_scores=T.nnet.softmax(end_span_scores_matrix)  #(batch, para_len)

    loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,0]]))
    end_loss_neg_likelihood=-T.mean(T.log(end_span_scores[T.arange(batch_size), gold_indices[:,1]]))  # fixed: was span_scores, which ignored the end classifier

    #ranking loss start
    tanh_span_scores_matrix = span_scores  #T.tanh(span_scores_matrix)  #(batch, gram_size)
    index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), gold_indices[:,0]], 1.0)

    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()]

    repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0), prob_batch_posi.shape[0], axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    #ranking loss END
    end_tanh_span_scores_matrix = end_span_scores  #T.tanh(span_scores_matrix)  #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), gold_indices[:,1]], 1.0)

    end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0-end_new_index_matrix).nonzero()]

    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x', 0), end_prob_batch_posi.shape[0], axis=0).flatten()
    end_loss_rank = T.mean(T.maximum(0.0, margin-end_repeat_posi+end_repeat_nega))

    loss = loss_neg_likelihood + end_loss_neg_likelihood + loss_rank + end_loss_rank

    #test
    test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para, norm_HL_4_para, norm_U_a, true_batch_size, true_p_len)  #(batch, test_p_len)
    mask_test_return=T.argmax(test_span_scores_matrix*para_mask, axis=1)  #batch

    end_test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, end_norm_HL_1_para, end_norm_HL_2_para, end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size, true_p_len)  #(batch, test_p_len)
    end_mask_test_return=T.argmax(end_test_span_scores_matrix*para_mask, axis=1)  #batch

    params = [embeddings, char_embeddings]+NN_para+[U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para]+[end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para]

    L2_reg =L2norm_paraList([embeddings, char_embeddings, conv_W_1, conv_W_2, conv_W_1_q, conv_W_2_q, conv_W_char, U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para])
    #L2_reg = L2norm_paraList(params)
    cost=loss  #+L2_weight*L2_reg

    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions, gold_indices, para_mask, q_mask,
                                   char_paragraph,  #(batch, char_len*p_len)
                                   char_questions, char_para_mask, char_q_mask, true_p_len],
                                  cost, updates=updates, on_unused_input='ignore')

    test_model = theano.function([paragraph, questions, para_mask, q_mask,
                                  char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len],
                                 [mask_test_return, end_mask_test_return], on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.

    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]

    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]

    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter counts how many batches have been run over all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_ids[para_id:para_id+batch_size]
            cost_i+= train_model(
                train_para_list[train_id_batch],
                train_Q_list[train_id_batch],
                train_label_list[train_id_batch],
                train_para_mask[train_id_batch],
                train_Q_mask[train_id_batch],
                train_para_char_list[train_id_batch],
                train_Q_char_list[train_id_batch],
                train_para_char_mask[train_id_batch],
                train_Q_char_mask[train_id_batch],
                p_len_limit)

            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict={}
                q_amount=0
                p1=0
                for test_para_id in test_batch_start:
                    batch_predict_ids, batch_predict_end_ids=test_model(
                        test_para_list[test_para_id:test_para_id+test_batch_size],
                        test_Q_list[test_para_id:test_para_id+test_batch_size],
                        test_para_mask[test_para_id:test_para_id+test_batch_size],
                        test_Q_mask[test_para_id:test_para_id+test_batch_size],
                        test_para_char_list[test_para_id:test_para_id+test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id+test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id+test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id+test_batch_size],
                        test_p_len_limit)
                    test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size]
#                     test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
#                     q_amount+=test_batch_size
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
                    q_amount+=test_batch_size

                    for q in range(test_batch_size):  #for each question
#                         pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])
                        start = batch_predict_ids[q]
                        end = batch_predict_end_ids[q]
                        if end < start:
                            start, end = end, start
                        pred_ans = ' '.join(test_para_wordlist_batch[q][start:end+1])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
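
# Editor's note (illustrative sketch, not part of the original code): the
# ranking losses above (loss_rank / end_loss_rank) pair every gold-position
# score with every non-gold score and apply a hinge, max(0, margin - posi + nega),
# pushing the gold start/end position above all others by at least `margin`.
# This hypothetical helper shows the same hinge for a single example in NumPy;
# the Theano version additionally pairs positives and negatives across the
# whole batch via T.extra_ops.repeat.
def _ranking_hinge_loss_sketch(scores, gold_index, margin=2.0):
    posi = scores[gold_index]                # score of the gold position
    nega = numpy.delete(scores, gold_index)  # all non-gold position scores
    return numpy.mean(numpy.maximum(0.0, margin - posi + nega))
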
# Variant of the previous word-level trainer: wider first hidden layer
# (6*hidden_size + char_emb_size inputs), L2 regularization added to the cost,
# and a fixed shuffle seed for reproducibility.
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300,
                    char_emb_size=20, hidden_size=300, L2_weight=0.0001,
                    p_len_limit=400, test_p_len_limit=100, q_len_limit=20,
                    char_len=15, filter_size=[5, 5], char_filter_size=3,
                    margin=2.0, max_EM=50.302743615):
    test_batch_size = batch_size * 10
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    word2id = {}
    char2id = {}
    #questions, paragraphs, q_masks, p_masks, labels, word2id
    train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id = load_squad_cnn_rank_word_train(
        word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size = len(train_para_list)

    test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_word_dev(
        word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size = len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX)

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, rng)
    rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, rng)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices = T.imatrix()  #batch, (start, end) for each sample
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    char_paragraph = T.imatrix()  #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask = T.fmatrix()
    char_q_mask = T.fmatrix()

    true_p_len = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size = paragraph.shape[0]

    common_input_p = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, true_p_len, emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    common_input_q = embeddings[questions.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size))

    char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape(
        (true_batch_size * true_p_len, char_len, char_emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q = char_embeddings[char_questions.flatten()].reshape(
        (true_batch_size * q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape((true_batch_size * true_p_len, char_len))
    char_q_masks = char_q_mask.reshape((true_batch_size * q_len_limit, char_len))

    conv_W_char, conv_b_char = create_conv_para(
        rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1 = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0]))
    conv_W_2, conv_b_2 = create_conv_para(
        rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))

    conv_W_1_q, conv_b_1_q = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0]))
    conv_W_2_q, conv_b_2_q = create_conv_para(
        rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))
    NN_para = [
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q,
        conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char
    ]

    input4score = squad_cnn_rank_word(
        rng, common_input_p, common_input_q, char_common_input_p,
        char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size,
        char_emb_size, char_len, filter_size, char_filter_size, hidden_size,
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q,
        conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask,
        char_p_masks, char_q_masks)  #(batch, 4*hidden, p_len_limit)

    test_input4score = squad_cnn_rank_word(
        rng, common_input_p, common_input_q, char_common_input_p,
        char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit,
        emb_size, char_emb_size, char_len, filter_size, char_filter_size,
        hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q,
        conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char,
        para_mask, q_mask, char_p_masks, char_q_masks)  #(batch, 4*hidden, p_len_limit)

#     gram_size = 5*true_p_len-(0+1+2+3+4)

    HL_1_para = create_ensemble_para(rng, hidden_size, 6 * hidden_size + char_emb_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a = normalize_matrix(U_a)
    norm_HL_1_para = normalize_matrix(HL_1_para)
    norm_HL_2_para = normalize_matrix(HL_2_para)
    norm_HL_3_para = normalize_matrix(HL_3_para)
    norm_HL_4_para = normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size, 6 * hidden_size + char_emb_size)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size)
    end_norm_U_a = normalize_matrix(end_U_a)
    end_norm_HL_1_para = normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para = normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para = normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para = normalize_matrix(end_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,
                                           norm_HL_2_para, norm_HL_3_para,
                                           norm_HL_4_para, norm_U_a,
                                           batch_size, true_p_len)
    span_scores = T.nnet.softmax(span_scores_matrix)  #(batch, para_len)
    end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,
                                               end_norm_HL_2_para,
                                               end_norm_HL_3_para,
                                               end_norm_HL_4_para,
                                               end_norm_U_a, batch_size,
                                               true_p_len)
    end_span_scores = T.nnet.softmax(end_span_scores_matrix)  #(batch, para_len)

    loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), gold_indices[:, 0]]))
    end_loss_neg_likelihood = -T.mean(
        T.log(end_span_scores[T.arange(batch_size), gold_indices[:, 1]]))  # fixed: was span_scores, which ignored the end classifier

    #ranking loss start
    tanh_span_scores_matrix = span_scores  #T.tanh(span_scores_matrix)  #(batch, gram_size)
    index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(
        index_matrix[T.arange(batch_size), gold_indices[:, 0]], 1.0)

    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0 - new_index_matrix).nonzero()]

    repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0),
                                     prob_batch_posi.shape[0], axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega))

    #ranking loss END
    end_tanh_span_scores_matrix = end_span_scores  #T.tanh(span_scores_matrix)  #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(
        end_index_matrix[T.arange(batch_size), gold_indices[:, 1]], 1.0)

    end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0 - end_new_index_matrix).nonzero()]

    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi,
                                         end_prob_batch_nega.shape[0], axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x', 0),
                                         end_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    end_loss_rank = T.mean(T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega))

    loss = loss_neg_likelihood + end_loss_neg_likelihood + loss_rank + end_loss_rank

    #test
    test_span_scores_matrix = add_HLs_2_tensor3(
        test_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para,
        norm_HL_4_para, norm_U_a, true_batch_size, true_p_len)  #(batch, test_p_len)
    mask_test_return = T.argmax(test_span_scores_matrix * para_mask, axis=1)  #batch

    end_test_span_scores_matrix = add_HLs_2_tensor3(
        test_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
        end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    end_mask_test_return = T.argmax(end_test_span_scores_matrix * para_mask, axis=1)  #batch

    params = (
        [embeddings, char_embeddings] + NN_para +
        [U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] +
        [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para])

    L2_reg = L2norm_paraList([
        embeddings, char_embeddings, conv_W_1, conv_W_2, conv_W_1_q,
        conv_W_2_q, conv_W_char, U_a, HL_1_para, HL_2_para, HL_3_para,
        HL_4_para
    ])
    #L2_reg = L2norm_paraList(params)
    cost = loss + L2_weight * L2_reg

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function(
        [
            paragraph, questions, gold_indices, para_mask, q_mask,
            char_paragraph,  #(batch, char_len*p_len)
            char_questions, char_para_mask, char_q_mask, true_p_len
        ],
        cost, updates=updates, on_unused_input='ignore')

    test_model = theano.function([
        paragraph, questions, para_mask, q_mask, char_paragraph,
        char_questions, char_para_mask, char_q_mask, true_p_len
    ], [mask_test_return, end_mask_test_return], on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
#     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.Random(4).shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter counts how many batches have been run over all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_ids[para_id:para_id + batch_size]
            cost_i += train_model(
                train_para_list[train_id_batch],
                train_Q_list[train_id_batch],
                train_label_list[train_id_batch],
                train_para_mask[train_id_batch],
                train_Q_mask[train_id_batch],
                train_para_char_list[train_id_batch],
                train_Q_char_list[train_id_batch],
                train_para_char_mask[train_id_batch],
                train_Q_char_mask[train_id_batch],
                p_len_limit)

            #print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict = {}
                q_amount = 0
                p1 = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids, batch_predict_end_ids = test_model(
                        test_para_list[test_para_id:test_para_id + test_batch_size],
                        test_Q_list[test_para_id:test_para_id + test_batch_size],
                        test_para_mask[test_para_id:test_para_id + test_batch_size],
                        test_Q_mask[test_para_id:test_para_id + test_batch_size],
                        test_para_char_list[test_para_id:test_para_id + test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id + test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id + test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id + test_batch_size],
                        test_p_len_limit)
                    test_para_wordlist_batch = test_para_wordlist_list[test_para_id:test_para_id + test_batch_size]
#                     test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
#                     q_amount+=test_batch_size
                    q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size]
                    q_amount += test_batch_size

                    for q in range(test_batch_size):  #for each question
#                         pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])
                        start = batch_predict_ids[q]
                        end = batch_predict_end_ids[q]
                        if end < start:
                            start, end = end, start
                        pred_ans = ' '.join(test_para_wordlist_batch[q][start:end + 1])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
                with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json',
                                                  rootPath + 'predictions.txt')
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
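
# Editor's note (illustrative sketch, not part of the original code): the
# trainers above compile AdaGrad updates symbolically -- a per-parameter
# accumulator of squared gradients divides the learning rate, so frequently
# updated dimensions take ever smaller steps. The hypothetical helper below is
# the same rule for NumPy arrays, matching the epsilon placement (outside the
# square root) used in the Theano update expressions.
def _adagrad_step_sketch(param, grad, acc, learning_rate=0.01, eps=1e-8):
    acc += grad ** 2                                         # accumulate squared gradient
    param -= learning_rate * grad / (numpy.sqrt(acc) + eps)  # per-dimension scaled step
    return param, acc
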
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=10, L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size=[5, 5, 5, 5, 5], char_filter_size=5, margin=0.85, extra_size=5 + 11, extra_emb=10, distance=10, distance_emb=10, comment='add distance embs'): #extra_size=3+46+7 test_batch_size = batch_size model_options = locals().copy() print "model options", model_options rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/' rng = numpy.random.RandomState(23455) word2id = {} char2id = {} train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras, word2id, char2id = load_squad_cnn_rank_span_word_train( word2id, char2id, p_len_limit, q_len_limit, char_len) test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_span_word_dev( word2id, char2id, test_p_len_limit, q_len_limit, char_len) ''' #store variables into file ''' # train_variables = [train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras] # test_variables =[test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list] # with open(rootPath+'extra.3.pickle', 'wb') as f: # Python 3: open(..., 'wb') # cPickle.dump(train_variables+test_variables, f, protocol=cPickle.HIGHEST_PROTOCOL) # f.close() # print 'variable stored successfully' # exit(0) ''' load variables from file ''' # before_load_time = time.time() # with open(rootPath+'extra.3.pickle', 'rb') as f: # Python 3: open(..., 'rb') # train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras,test_Q_list, test_para_list, test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = cPickle.load(f) # f.close() # print 'load data variables successfully, spend: ', (time.time()-before_load_time)/60.0, ' mins' train_size = len(train_para_list) test_size = len(test_para_list) train_Q_list = numpy.asarray(train_Q_list, dtype='int32') train_para_list = numpy.asarray(train_para_list, dtype='int32') train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX) train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX) train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32') train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32') train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX) train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX) train_para_extras = numpy.asarray(train_para_extras, dtype=theano.config.floatX) train_span_label_list = numpy.asarray(train_span_label_list, dtype='int32') train_word_label_list = numpy.asarray(train_word_label_list, dtype='int32') test_Q_list = numpy.asarray(test_Q_list, dtype='int32') 
test_para_list = numpy.asarray(test_para_list, dtype='int32') test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX) test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX) test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32') test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32') test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX) test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX) test_para_extras = numpy.asarray(test_para_extras, dtype=theano.config.floatX) vocab_size = len(word2id) print 'vocab size: ', vocab_size rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX, rng) rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_glove() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared(value=rand_values, borrow=True) char_size = len(char2id) print 'char size: ', char_size char_rand_values = random_value_normal((char_size + 1, char_emb_size), theano.config.floatX, rng) char_embeddings = theano.shared(value=char_rand_values, borrow=True) extra_rand_values = random_value_normal((extra_size, extra_emb), theano.config.floatX, rng) extra_embeddings = theano.shared(value=extra_rand_values, borrow=True) distance_rand_values = random_value_normal( (2 * distance + 1, distance_emb), theano.config.floatX, rng) distance_embeddings = theano.shared(value=distance_rand_values, borrow=True) # allocate symbolic variables for the data # index = T.lscalar() paragraph = T.imatrix('paragraph') questions = T.imatrix('questions') span_indices = T.ivector() #batch word_indices = T.imatrix() #(batch, 2) ans_indices = T.ivector() # for one batch, the length is dynamic para_mask = T.fmatrix('para_mask') q_mask = T.fmatrix('q_mask') extra = T.ftensor3() #(batch, p_len, 3) char_paragraph = T.imatrix() #(batch, char_len*p_len) char_questions = T.imatrix() char_para_mask = T.fmatrix() char_q_mask = T.fmatrix() true_p_len = T.iscalar() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' true_batch_size = paragraph.shape[0] extra_rep_batch = T.concatenate( [extra.dot(extra_embeddings), extra], axis=2) #(batch, p_len, extra_emb+extra_size) zero_pad = T.zeros((true_batch_size, 1, extra_emb + extra_size)) left_context = T.concatenate([zero_pad, extra_rep_batch[:, :-1, :]], axis=1) #(batch, p_len, extra_emb+extra_size) right_context = T.concatenate( [extra_rep_batch[:, 1:, :], zero_pad], axis=1) #(batch, p_len, extra_emb+extra_size) left_context_2 = T.concatenate( [zero_pad, zero_pad, extra_rep_batch[:, :-2, :]], axis=1) #(batch, p_len, extra_emb+extra_size) right_context_2 = T.concatenate( [extra_rep_batch[:, 2:, :], zero_pad, zero_pad], axis=1) #(batch, p_len, extra_emb+extra_size) simi2left = T.sum(extra_rep_batch * left_context, axis=2).dimshuffle(0, 1, 'x') #(batch, p_len, 1) simi2right = T.sum(extra_rep_batch * right_context, axis=2).dimshuffle(0, 1, 'x') #(batch, p_len, 1) cos2left = cosine_tensor3(extra_rep_batch, left_context, 2).dimshuffle(0, 1, 'x') cos2right = cosine_tensor3(extra_rep_batch, right_context, 2).dimshuffle(0, 1, 'x') diff2left = extra_rep_batch - left_context diff2right = extra_rep_batch - right_context #(batch, p_len, extra_emb+extra_size) extra_rep_batch = T.concatenate( [ extra_rep_batch, left_context, right_context, left_context_2, right_context_2, diff2left, diff2right, simi2left, simi2right, cos2left, cos2right ], axis=2) #batch, p_len, 7*(extra_emb+extra_size)+4) true_extra_size = 7 * (extra_emb + extra_size) + 4 common_input_p = embeddings[paragraph.flatten()].reshape( (true_batch_size, true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM common_input_q = embeddings[questions.flatten()].reshape( (true_batch_size, q_len_limit, emb_size)) char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape( (true_batch_size * true_p_len, char_len, char_emb_size )) #the input format can be adapted into CNN or GRU or LSTM char_common_input_q = char_embeddings[char_questions.flatten()].reshape( (true_batch_size * q_len_limit, char_len, char_emb_size)) char_p_masks = char_para_mask.reshape( (true_batch_size * true_p_len, char_len)) char_q_masks = char_q_mask.reshape( (true_batch_size * q_len_limit, char_len)) conv_W_char, conv_b_char = create_conv_para( rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size)) conv_W_1, conv_b_1 = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size + true_extra_size, filter_size[0])) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_3, conv_b_3 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2])) # conv_W_4, conv_b_4=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3])) # conv_W_5, conv_b_5=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4])) conv_W_1_q, conv_b_1_q = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0])) conv_W_2_q, conv_b_2_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1])) conv_W_3_q, conv_b_3_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2])) # conv_W_4_q, conv_b_4_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3])) # conv_W_5_q, conv_b_5_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4])) CNN_para = [ conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, 
    common_input_p = embeddings[paragraph.flatten()].reshape((true_batch_size, true_p_len, emb_size))  # this input format can feed a CNN, GRU or LSTM
    common_input_q = embeddings[questions.flatten()].reshape((true_batch_size, q_len_limit, emb_size))
    char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape((true_batch_size * true_p_len, char_len, char_emb_size))
    char_common_input_q = char_embeddings[char_questions.flatten()].reshape((true_batch_size * q_len_limit, char_len, char_emb_size))
    char_p_masks = char_para_mask.reshape((true_batch_size * true_p_len, char_len))
    char_q_masks = char_q_mask.reshape((true_batch_size * q_len_limit, char_len))

    conv_W_char, conv_b_char = create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1 = create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size + true_extra_size, filter_size[0]))
    conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))
    conv_W_3, conv_b_3 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2]))
    # conv_W_4, conv_b_4 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3]))
    # conv_W_5, conv_b_5 = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4]))
    conv_W_1_q, conv_b_1_q = create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size + char_emb_size, filter_size[0]))
    conv_W_2_q, conv_b_2_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))
    conv_W_3_q, conv_b_3_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[2]))
    # conv_W_4_q, conv_b_4_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3]))
    # conv_W_5_q, conv_b_5_q = create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4]))
    CNN_para = [
        conv_W_1, conv_b_1, conv_W_2, conv_b_2,
        conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,
        conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q,
        # conv_W_4, conv_b_4, conv_W_5, conv_b_5, conv_W_4_q, conv_b_4_q, conv_W_5_q, conv_b_5_q,
        conv_W_char, conv_b_char
    ]

    span_input4score, word_input4score, overall_span_hidden_size, overall_word_hidden_size = squad_cnn_rank_spans_word(
        rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,
        batch_size, p_len_limit, q_len_limit, emb_size, char_emb_size, char_len,
        filter_size, char_filter_size, hidden_size,
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,
        conv_W_char, conv_b_char, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q,
        # conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q,
        # conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q,
        para_mask, q_mask, char_p_masks, char_q_masks, extra_rep_batch, true_extra_size)
    test_span_input4score, test_word_input4score, _, _ = squad_cnn_rank_spans_word(
        rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,
        test_batch_size, test_p_len_limit, q_len_limit, emb_size, char_emb_size, char_len,
        filter_size, char_filter_size, hidden_size,
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,
        conv_W_char, conv_b_char, conv_W_3, conv_b_3, conv_W_3_q, conv_b_3_q,
        # conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q,
        # conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q,
        para_mask, q_mask, char_p_masks, char_q_masks, extra_rep_batch, true_extra_size)  # (batch, hidden, gram_size)

    gram_size = 5 * true_p_len - (0 + 1 + 2 + 3 + 4)  # candidate spans of width 1..5

    # U_a = create_ensemble_para(rng, 1, 4*hidden_size)
    # norm_U_a = normalize_matrix(U_a)
    # span_scores_matrix = T.dot(span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((batch_size, gram_size))

    span_HL_1_para = create_ensemble_para(rng, hidden_size, overall_span_hidden_size)
    span_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_U_a = create_ensemble_para(rng, 1, hidden_size + overall_span_hidden_size)
    norm_span_U_a = normalize_matrix(span_U_a)
    norm_span_HL_1_para = normalize_matrix(span_HL_1_para)
    norm_span_HL_2_para = normalize_matrix(span_HL_2_para)
    norm_span_HL_3_para = normalize_matrix(span_HL_3_para)
    norm_span_HL_4_para = normalize_matrix(span_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(span_input4score, norm_span_HL_1_para, norm_span_HL_2_para,
                                           norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a,
                                           batch_size, gram_size)
    span_scores = T.nnet.softmax(span_scores_matrix)  # (batch, gram_size)
    loss_neg_likelihood = -T.mean(T.log(span_scores[T.arange(batch_size), span_indices]))

    # ranking loss
    tanh_span_scores_matrix = span_scores  # T.tanh(span_scores_matrix)  # (batch, gram_size)
    index_matrix = T.zeros((batch_size, gram_size), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), span_indices], 1.0)
    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0 - new_index_matrix).nonzero()]
    repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0), prob_batch_posi.shape[0], axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega))
    span_loss = loss_neg_likelihood + loss_rank

    # test_span_scores_matrix = T.dot(test_span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((true_batch_size, gram_size))  # (batch, 13*para_len-78)
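    # --- Editor's illustration: a hypothetical helper, defined but never called. ---
    # The ranking term above compares the gold-span probability of each example
    # against every negative span in the batch under a margin hinge; the
    # repeat/repeat pair above enumerates all positive/negative pairs. In numpy:
    def _demo_pairwise_hinge(posi, nega, margin):
        # posi: (n_pos,) gold-span scores; nega: (n_neg,) negative-span scores
        diffs = margin - posi[:, None] + nega[None, :]  # every pos/neg pair
        return numpy.maximum(0.0, diffs).mean()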
    test_span_scores_matrix = add_HLs_2_tensor3(test_span_input4score, norm_span_HL_1_para, norm_span_HL_2_para,
                                                norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a,
                                                true_batch_size, gram_size)

    # word-level start/end prediction
    HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    start_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size)
    norm_start_U_a = normalize_matrix(start_U_a)
    norm_HL_1_para = normalize_matrix(HL_1_para)
    norm_HL_2_para = normalize_matrix(HL_2_para)
    norm_HL_3_para = normalize_matrix(HL_3_para)
    norm_HL_4_para = normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size + distance_emb)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size + distance_emb)
    end_norm_U_a = normalize_matrix(end_U_a)
    end_norm_HL_1_para = normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para = normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para = normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para = normalize_matrix(end_HL_4_para)

    start_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_HL_1_para, norm_HL_2_para,
                                            norm_HL_3_para, norm_HL_4_para, norm_start_U_a,
                                            batch_size, true_p_len)
    start_scores = T.nnet.softmax(start_scores_matrix)  # (batch, para_len)

    '''
    forward start info to end prediction
    '''
    distance_matrix = word_indices[:, 0].dimshuffle(0, 'x') - T.arange(true_p_len).dimshuffle('x', 0)  # (batch, p_len)
    distance_trunc_matrix = T.maximum(-distance, T.minimum(distance, distance_matrix)) + distance  # (batch, p_len), ids in [0, 2*distance]
    zero_distance_matrix = T.zeros((true_batch_size * true_p_len, 2 * distance + 1))
    filled_distance_matrix = T.set_subtensor(
        zero_distance_matrix[T.arange(true_batch_size * true_p_len), distance_trunc_matrix.flatten()], 1.0)
    filled_distance_tensor3 = filled_distance_matrix.reshape(
        (true_batch_size, true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle(0, 2, 1)  # (batch_size, distance_emb, p_len)
    end_word_input4score = T.concatenate([word_input4score, filled_distance_tensor3], axis=1)  # (batch, overall_word_hidden_size+distance_emb, p_len)
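    # --- Editor's illustration: a hypothetical helper, defined but never called. ---
    # The distance feature above clips the signed offset between the gold start
    # word and each paragraph position to [-distance, distance], then shifts it
    # into [0, 2*distance] so it can index rows of `distance_embeddings`:
    def _demo_truncated_distance(start_idx, p_len, distance):
        offsets = start_idx - numpy.arange(p_len)           # signed offsets
        clipped = numpy.clip(offsets, -distance, distance)  # truncate both tails
        return clipped + distance                           # embedding ids in [0, 2*distance]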
    end_scores_matrix = add_HLs_2_tensor3(end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
                                          end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a,
                                          batch_size, true_p_len)
    end_scores = T.nnet.softmax(end_scores_matrix)  # (batch, para_len)

    start_loss_neg_likelihood = -T.mean(T.log(start_scores[T.arange(batch_size), word_indices[:, 0]]))
    end_loss_neg_likelihood = -T.mean(T.log(end_scores[T.arange(batch_size), word_indices[:, 1]]))

    # ranking loss, start word
    tanh_start_scores_matrix = start_scores  # T.tanh(start_scores_matrix)  # (batch, p_len)
    start_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    start_new_index_matrix = T.set_subtensor(start_index_matrix[T.arange(batch_size), word_indices[:, 0]], 1.0)
    start_prob_batch_posi = tanh_start_scores_matrix[start_new_index_matrix.nonzero()]
    start_prob_batch_nega = tanh_start_scores_matrix[(1.0 - start_new_index_matrix).nonzero()]
    start_repeat_posi = T.extra_ops.repeat(start_prob_batch_posi, start_prob_batch_nega.shape[0], axis=0)
    start_repeat_nega = T.extra_ops.repeat(start_prob_batch_nega.dimshuffle('x', 0), start_prob_batch_posi.shape[0], axis=0).flatten()
    start_loss_rank = T.mean(T.maximum(0.0, margin - start_repeat_posi + start_repeat_nega))

    # ranking loss, end word
    end_tanh_scores_matrix = end_scores  # T.tanh(end_scores_matrix)  # (batch, p_len)
    end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), word_indices[:, 1]], 1.0)
    end_prob_batch_posi = end_tanh_scores_matrix[end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_scores_matrix[(1.0 - end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x', 0), end_prob_batch_posi.shape[0], axis=0).flatten()
    end_loss_rank = T.mean(T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega))

    word_loss = start_loss_neg_likelihood + end_loss_neg_likelihood + start_loss_rank + end_loss_rank

    # test
    test_start_scores_matrix = add_HLs_2_tensor3(test_word_input4score, norm_HL_1_para, norm_HL_2_para,
                                                 norm_HL_3_para, norm_HL_4_para, norm_start_U_a,
                                                 true_batch_size, true_p_len)  # (batch, test_p_len)
    mask_test_start_return = test_start_scores_matrix * para_mask  # (batch, p_len)

    '''
    forward start info to end prediction in testing
    '''
    test_distance_matrix = T.argmax(mask_test_start_return, axis=1).dimshuffle(0, 'x') - T.arange(true_p_len).dimshuffle('x', 0)  # (batch, p_len)
    test_distance_trunc_matrix = T.maximum(-distance, T.minimum(distance, test_distance_matrix)) + distance  # (batch, p_len)
    test_zero_distance_matrix = T.zeros((true_batch_size * true_p_len, 2 * distance + 1))
    test_filled_distance_matrix = T.set_subtensor(
        test_zero_distance_matrix[T.arange(true_batch_size * true_p_len), test_distance_trunc_matrix.flatten()], 1.0)
    test_filled_distance_tensor3 = test_filled_distance_matrix.reshape(
        (true_batch_size, true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle(0, 2, 1)  # (batch_size, distance_emb, p_len)
    test_end_word_input4score = T.concatenate([test_word_input4score, test_filled_distance_tensor3], axis=1)  # (batch, overall_word_hidden_size+distance_emb, p_len)

    end_test_scores_matrix = add_HLs_2_tensor3(test_end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
                                               end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a,
                                               true_batch_size, true_p_len)  # (batch, test_p_len)
    end_mask_test_return = end_test_scores_matrix * para_mask  # (batch, p_len)

    word_gram_1 = mask_test_start_return + end_mask_test_return                 # (batch, p_len)
    word_gram_2 = mask_test_start_return[:, :-1] + end_mask_test_return[:, 1:]  # (batch, p_len-1)
    word_gram_3 = mask_test_start_return[:, :-2] + end_mask_test_return[:, 2:]  # (batch, p_len-2)
    word_gram_4 = mask_test_start_return[:, :-3] + end_mask_test_return[:, 3:]  # (batch, p_len-3)
    word_gram_5 = mask_test_start_return[:, :-4] + end_mask_test_return[:, 4:]  # (batch, p_len-4)
    word_pair_scores = T.concatenate([word_gram_1, word_gram_2, word_gram_3, word_gram_4, word_gram_5], axis=1)  # (batch_size, gram_size)
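    # --- Editor's illustration: a hypothetical helper, defined but never called. ---
    # word_gram_1..word_gram_5 above score a span of width w as
    # start_score[i] + end_score[i+w-1]; width w contributes p_len-(w-1)
    # candidates, so widths 1..5 give 5*p_len-(0+1+2+3+4) columns, i.e. gram_size:
    def _demo_pair_span_scores(start_s, end_s):
        # start_s, end_s: (batch, p_len) word-level scores
        grams = [start_s + end_s]                # width 1
        for w in range(1, 5):                    # widths 2..5
            grams.append(start_s[:, :-w] + end_s[:, w:])
        return numpy.concatenate(grams, axis=1)  # (batch, 5*p_len-10)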
    # answer-word scoring, training
    ans_HL_1_para = create_ensemble_para(rng, hidden_size, overall_word_hidden_size)
    ans_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_U_a = create_ensemble_para(rng, 1, hidden_size + overall_word_hidden_size)
    norm_ans_U_a = normalize_matrix(ans_U_a)
    norm_ans_HL_1_para = normalize_matrix(ans_HL_1_para)
    norm_ans_HL_2_para = normalize_matrix(ans_HL_2_para)
    norm_ans_HL_3_para = normalize_matrix(ans_HL_3_para)
    norm_ans_HL_4_para = normalize_matrix(ans_HL_4_para)

    ans_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para,
                                          norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a,
                                          batch_size, true_p_len)
    ans_scores_vec = T.nnet.softmax(ans_scores_matrix).flatten()  # (batch*para_len,)
    ans_loss_neg_likelihood = -T.mean(T.log(ans_scores_vec[ans_indices]))
    ans_index_vec = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX).flatten()
    ans_new_index = T.set_subtensor(ans_index_vec[ans_indices], 1.0)
    ans_prob_batch_posi = ans_scores_vec[ans_new_index.nonzero()]
    ans_prob_batch_nega = ans_scores_vec[(1.0 - ans_new_index).nonzero()]
    ans_repeat_posi = T.extra_ops.repeat(ans_prob_batch_posi, ans_prob_batch_nega.shape[0], axis=0)
    ans_repeat_nega = T.extra_ops.repeat(ans_prob_batch_nega.dimshuffle('x', 0), ans_prob_batch_posi.shape[0], axis=0).flatten()
    ans_loss_rank = T.mean(T.maximum(0.0, margin - ans_repeat_posi + ans_repeat_nega))
    ans_loss = ans_loss_neg_likelihood + ans_loss_rank

    # answer-word scoring, testing
    test_ans_scores_matrix = add_HLs_2_tensor3(test_word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para,
                                               norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a,
                                               true_batch_size, true_p_len)
    test_ans_scores_matrix = test_ans_scores_matrix * para_mask  # T.nnet.softmax(test_ans_scores_matrix)  # (batch, para_len)
    ans_gram_1 = test_ans_scores_matrix
    ans_gram_2 = (test_ans_scores_matrix[:, :-1] + test_ans_scores_matrix[:, 1:]) / 2.0  # (batch, p_len-1)
    ans_gram_3 = (test_ans_scores_matrix[:, :-2] + test_ans_scores_matrix[:, 1:-1] +
                  test_ans_scores_matrix[:, 2:]) / 3.0  # (batch, p_len-2)
    ans_gram_4 = (test_ans_scores_matrix[:, :-3] + test_ans_scores_matrix[:, 1:-2] +
                  test_ans_scores_matrix[:, 2:-1] + test_ans_scores_matrix[:, 3:]) / 4.0  # (batch, p_len-3)
    ans_gram_5 = (test_ans_scores_matrix[:, :-4] + test_ans_scores_matrix[:, 1:-3] +
                  test_ans_scores_matrix[:, 2:-2] + test_ans_scores_matrix[:, 3:-1] +
                  test_ans_scores_matrix[:, 4:]) / 5.0  # (batch, p_len-4)
    ans_word_scores = T.concatenate([ans_gram_1, ans_gram_2, ans_gram_3, ans_gram_4, ans_gram_5], axis=1)  # (batch, 5*p_len-10)

    '''
    form test spans and masks
    '''
    test_span_word_scores_matrix = word_pair_scores + ans_word_scores  # test_span_scores_matrix+
    test_spans_mask_1 = para_mask
    test_spans_mask_2 = para_mask[:, :-1] * para_mask[:, 1:]                                            # (batch, p_len-1)
    test_spans_mask_3 = para_mask[:, :-2] * para_mask[:, 1:-1] * para_mask[:, 2:]                       # (batch, p_len-2)
    test_spans_mask_4 = para_mask[:, :-3] * para_mask[:, 1:-2] * para_mask[:, 2:-1] * para_mask[:, 3:]  # (batch, p_len-3)
    test_spans_mask_5 = para_mask[:, :-4] * para_mask[:, 1:-3] * para_mask[:, 2:-2] * para_mask[:, 3:-1] * para_mask[:, 4:]  # (batch, p_len-4)
    test_spans_mask = T.concatenate([test_spans_mask_1, test_spans_mask_2, test_spans_mask_3,
                                     test_spans_mask_4, test_spans_mask_5], axis=1)  # (batch, 5*p_len-10)
    # test_return = T.argmax(test_span_word_scores_matrix, axis=1)  # batch
    test_return = T.argmax(test_span_word_scores_matrix * test_spans_mask, axis=1)  # batch
    # params = [embeddings, char_embeddings] + NN_para + [U_a]
    params = ([embeddings, char_embeddings, extra_embeddings, distance_embeddings]
              + CNN_para
              # + [span_U_a, span_HL_1_para, span_HL_2_para, span_HL_3_para, span_HL_4_para]
              + [start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para]
              + [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para]
              + [ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para])

    L2_reg = L2norm_paraList([
        embeddings, char_embeddings, extra_embeddings, distance_embeddings,
        conv_W_1, conv_W_2, conv_W_1_q, conv_W_2_q, conv_W_char, conv_W_3, conv_W_3_q,
        # conv_W_4, conv_W_5, conv_W_4_q, conv_W_5_q,
        # span_U_a, span_HL_1_para, span_HL_2_para, span_HL_3_para, span_HL_4_para,
        start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para,
        end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para,
        ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para
    ])
    # L2_reg = L2norm_paraList(params)
    cost = word_loss + ans_loss + L2_weight * L2_reg  # span_loss+

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        # print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))  # AdaGrad
        updates.append((acc_i, acc))
    # updates = Adam(cost, params, lr=0.0001)
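    # --- Editor's illustration: a hypothetical helper, defined but never called. ---
    # The update loop above is plain AdaGrad: accumulate squared gradients and
    # divide each step by the root of the accumulator. One step in numpy:
    def _demo_adagrad_step(param, grad, acc, lr, eps=1e-8):
        acc = acc + grad ** 2
        return param - lr * grad / (numpy.sqrt(acc) + eps), acc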
    train_model = theano.function(
        [paragraph, questions, span_indices, word_indices, ans_indices, para_mask, q_mask,
         extra, char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len],
        cost, updates=updates, on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extra,
         char_paragraph, char_questions, char_para_mask, char_q_mask, true_p_len],
        test_return, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    # para_list, Q_list, label_list, mask, vocab_size = load_train()
    n_train_batches = train_size / batch_size
    # remain_train = train_size % batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_test_batches = test_size / test_batch_size
    # remain_test = test_size % test_batch_size
    test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.Random(200).shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter counts how many batches have been trained on, accumulated over epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_ids[para_id:para_id + batch_size]
            boundary_labels_batch = train_word_label_list[train_id_batch]
            # flatten each example's [start, end] boundary into indices over the
            # concatenated (batch*p_len_limit) word axis consumed by ans_indices
            ans_label_list = []
            for i in range(batch_size):
                start = boundary_labels_batch[i][0] + i * p_len_limit
                end = boundary_labels_batch[i][1] + i * p_len_limit
                ans_label_list += range(start, end + 1)
            ans_label_list = numpy.asarray(ans_label_list, dtype='int32')

            cost_i += train_model(train_para_list[train_id_batch],
                                  train_Q_list[train_id_batch],
                                  train_span_label_list[train_id_batch],
                                  boundary_labels_batch,
                                  ans_label_list,
                                  train_para_mask[train_id_batch],
                                  train_Q_mask[train_id_batch],
                                  train_para_extras[train_id_batch],
                                  train_para_char_list[train_id_batch],
                                  train_Q_char_list[train_id_batch],
                                  train_para_char_mask[train_id_batch],
                                  train_Q_char_mask[train_id_batch],
                                  p_len_limit)

            # print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict = {}
                q_amount = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids = test_model(
                        test_para_list[test_para_id:test_para_id + test_batch_size],
                        test_Q_list[test_para_id:test_para_id + test_batch_size],
                        test_para_mask[test_para_id:test_para_id + test_batch_size],
                        test_Q_mask[test_para_id:test_para_id + test_batch_size],
                        test_para_extras[test_para_id:test_para_id + test_batch_size],
                        test_para_char_list[test_para_id:test_para_id + test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id + test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id + test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id + test_batch_size],
                        test_p_len_limit)
                    test_para_wordlist_batch = test_para_wordlist_list[test_para_id:test_para_id + test_batch_size]
                    q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size]
                    q_amount += test_batch_size
                    for q in range(test_batch_size):  # for each question
                        pred_ans = decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
                        # print q_id, test_para_wordlist_batch[q], '\t', pred_ans
                with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json', rootPath + 'predictions.txt')
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                    # if max_exact_acc > max_EM:
                    #     store_model_to_file(rootPath + 'Best_Paras_AI2_' + str(max_exact_acc), params)
                    #     print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq

    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_exact_acc

def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, batch_size=20, test_batch_size=200,
                    emb_size=300, hidden_size=300, L2_weight=0.0001, para_len_limit=400,
                    q_len_limit=40, max_EM=50.302743615):
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    # glove_vocab = set(word2vec.keys())
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist = load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    # rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    # id2word = {y: x for x, y in overall_word2id.iteritems()}
    # word2vec = load_glove()
    # rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    # index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    # labels = T.imatrix('labels')  # (batch, para_len)
    gold_indices = T.ivector()  # batch
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)
    is_train = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size = paragraph.shape[0]

    norm_extraF = normalize_matrix(extraF)

    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]

    U_e1, W_e1, b_e1 = create_GRU_para(rng, 3 * hidden_size + 3, hidden_size)
    U_e1_b, W_e1_b, b_e1_b = create_GRU_para(rng, 3 * hidden_size + 3, hidden_size)
    paragraph_para_e1 = [U_e1, W_e1, b_e1, U_e1_b, W_e1_b, b_e1_b]

    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

    # W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)  # init_weights((2*hidden_size, hidden_size))
    # W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, 2 * hidden_size)  # 3 extra features
    # LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX), name='LR_b', borrow=True)
    HL_paras = [U_a]
    params = [embeddings] + paragraph_para + Q_para + paragraph_para_e1 + HL_paras
    load_model_from_file(rootPath + 'Best_Paras_conv_50.302743614', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2, 1))  # (batch_size, emb_size, maxparalen)
    concate_paragraph_input = T.concatenate([paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1)

    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,
                                                          U=U1, W=W1, b=b1, Ub=U1_b, Wb=W1_b, bb=b1_b)
    para_reps = paragraph_model.output_tensor  # (batch, hidden, para_len)

    # # LSTM
    # fwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    # bwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    # paragraph_para = fwd_LSTM_para_dict.values() + bwd_LSTM_para_dict.values()  # .values returns a list of parameters
    # paragraph_model = Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    # para_reps = paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2, 1))  # (#questions, emb_size, maxsenlength)
    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size,
                                                          U=UQ, W=WQ, b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    questions_reps_tensor = questions_model.output_tensor
    questions_reps = questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size))  # (batch, 1, hidden)
    questions_reps = T.repeat(questions_reps, para_reps.shape[2], axis=1)  # (batch, para_len, hidden)

    # # LSTM for questions
    # fwd_LSTM_q_dict = create_LSTM_para(rng, emb_size, hidden_size)
    # bwd_LSTM_q_dict = create_LSTM_para(rng, emb_size, hidden_size)
    # Q_para = fwd_LSTM_q_dict.values() + bwd_LSTM_q_dict.values()  # .values returns a list of parameters
    # questions_model = Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask, hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    # questions_reps_tensor = questions_model.output_tensor

    def example_in_batch(para_matrix, q_matrix):
        # assume both are (hidden, len)
        transpose_para_matrix = para_matrix.T
        interaction_matrix = T.dot(transpose_para_matrix, q_matrix)  # (para_len, q_len)
        norm_interaction_matrix = T.nnet.softmax(interaction_matrix)
        # norm_interaction_matrix = T.maximum(0.0, interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T) / T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x', 0)  # (hidden, para_len)

    batch_q_reps, updates = theano.scan(fn=example_in_batch,
                                        outputs_info=None,
                                        sequences=[para_reps, questions_reps_tensor])  # batch_q_reps: (batch, hidden, para_len)
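    # --- Editor's illustration: a hypothetical helper, defined but never called. ---
    # example_in_batch above aligns each paragraph position with the question:
    # a row softmax over the (para_len, q_len) dot-product matrix gives attention
    # weights, and every paragraph word receives a weighted mix of question
    # states. (The extra division in example_in_batch divides by the row sums of
    # the softmax, which are 1, so it is effectively a no-op.) In numpy:
    def _demo_q_aware_para(para_matrix, q_matrix):
        # para_matrix: (hidden, para_len); q_matrix: (hidden, q_len)
        inter = numpy.dot(para_matrix.T, q_matrix)       # (para_len, q_len)
        inter = numpy.exp(inter - inter.max(axis=1, keepdims=True))
        attn = inter / inter.sum(axis=1, keepdims=True)  # row softmax
        return numpy.dot(q_matrix, attn.T)               # (hidden, para_len)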
    # para_reps, batch_q_reps and questions_reps.dimshuffle(0,2,1) are all (batch, hidden, para_len)
    ensemble_para_reps_tensor = T.concatenate([para_reps, batch_q_reps, questions_reps.dimshuffle(0, 2, 1),
                                               norm_extraF.dimshuffle(0, 2, 1)], axis=1)  # (batch, 3*hidden+3, para_len)
    para_ensemble_model = Bd_GRU_Batch_Tensor_Input_with_Mask(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,
                                                              U=U_e1, W=W_e1, b=b_e1, Ub=U_e1_b, Wb=W_e1_b, bb=b_e1_b)
    para_reps_tensor4score = para_ensemble_model.output_tensor  # (batch, hidden, para_len)
    para_reps_tensor4score = dropout_standard(is_train, para_reps_tensor4score, 0.2, rng)

    # span representations: concatenate the first and last word reps for widths 1..13
    span_1 = T.concatenate([para_reps_tensor4score, para_reps_tensor4score], axis=1)                          # (batch, 2*hidden, para_len)
    span_2 = T.concatenate([para_reps_tensor4score[:, :, :-1], para_reps_tensor4score[:, :, 1:]], axis=1)     # (batch, 2*hidden, para_len-1)
    span_3 = T.concatenate([para_reps_tensor4score[:, :, :-2], para_reps_tensor4score[:, :, 2:]], axis=1)     # (batch, 2*hidden, para_len-2)
    span_4 = T.concatenate([para_reps_tensor4score[:, :, :-3], para_reps_tensor4score[:, :, 3:]], axis=1)     # (batch, 2*hidden, para_len-3)
    span_5 = T.concatenate([para_reps_tensor4score[:, :, :-4], para_reps_tensor4score[:, :, 4:]], axis=1)     # (batch, 2*hidden, para_len-4)
    span_6 = T.concatenate([para_reps_tensor4score[:, :, :-5], para_reps_tensor4score[:, :, 5:]], axis=1)     # (batch, 2*hidden, para_len-5)
    span_7 = T.concatenate([para_reps_tensor4score[:, :, :-6], para_reps_tensor4score[:, :, 6:]], axis=1)     # (batch, 2*hidden, para_len-6)
    span_8 = T.concatenate([para_reps_tensor4score[:, :, :-7], para_reps_tensor4score[:, :, 7:]], axis=1)     # (batch, 2*hidden, para_len-7)
    span_9 = T.concatenate([para_reps_tensor4score[:, :, :-8], para_reps_tensor4score[:, :, 8:]], axis=1)     # (batch, 2*hidden, para_len-8)
    span_10 = T.concatenate([para_reps_tensor4score[:, :, :-9], para_reps_tensor4score[:, :, 9:]], axis=1)    # (batch, 2*hidden, para_len-9)
    span_11 = T.concatenate([para_reps_tensor4score[:, :, :-10], para_reps_tensor4score[:, :, 10:]], axis=1)  # (batch, 2*hidden, para_len-10)
    span_12 = T.concatenate([para_reps_tensor4score[:, :, :-11], para_reps_tensor4score[:, :, 11:]], axis=1)  # (batch, 2*hidden, para_len-11)
    span_13 = T.concatenate([para_reps_tensor4score[:, :, :-12], para_reps_tensor4score[:, :, 12:]], axis=1)  # (batch, 2*hidden, para_len-12)
    span_reps = T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7, span_8, span_9,
                               span_10, span_11, span_12, span_13], axis=2)  # (batch, 2*hidden, 13*para_len-78)
    test_span_reps = T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7], axis=2)  # (batch, 2*hidden, 7*para_len-21)
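    # --- Editor's illustration: a hypothetical helper, defined but never called. ---
    # span_1..span_13 above represent a width-w span by concatenating the hidden
    # states of its first and last word; width w yields para_len-(w-1) candidates,
    # so widths 1..13 give 13*para_len-(0+1+...+12) = 13*para_len-78 columns and
    # widths 1..7 give 7*para_len-21, the reshape sizes used in the scoring below:
    def _demo_boundary_span_reps(H, max_width):
        # H: (batch, hidden, para_len) word states -> (batch, 2*hidden, n_spans)
        spans = [numpy.concatenate([H, H], axis=1)]  # width 1: first word == last word
        for w in range(1, max_width):
            spans.append(numpy.concatenate([H[:, :, :-w], H[:, :, w:]], axis=1))
        return numpy.concatenate(spans, axis=2)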
    # score each span rep
    norm_U_a = normalize_matrix(U_a)
    span_scores_tensor = T.dot(span_reps.dimshuffle(0, 2, 1), norm_U_a)  # (batch, 13*para_len-78, 1)
    span_scores = T.nnet.softmax(span_scores_tensor.reshape((true_batch_size, 13 * paragraph.shape[1] - 78)))  # (batch, 13*para_len-78)
    loss = -T.sum(T.log(span_scores[T.arange(true_batch_size), gold_indices]))

    test_span_scores_tensor = T.dot(test_span_reps.dimshuffle(0, 2, 1), norm_U_a)  # (batch, 7*para_len-21, 1)
    test_span_scores = T.nnet.softmax(test_span_scores_tensor.reshape((true_batch_size, 7 * paragraph.shape[1] - 21)))  # (batch, 7*para_len-21)
    test_return = T.argmax(test_span_scores, axis=1)  # batch

    # params = layer3.params + layer2.params + layer1.params + [conv_W, conv_b]
    # L2_reg = L2norm_paraList([embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    # L2_reg = L2norm_paraList([embeddings])
    cost = loss  # + ConvGRU_1.error

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        # print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))  # AdaGrad
        updates.append((acc_i, acc))
    # updates = Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions, gold_indices, para_mask, q_mask, extraF, is_train],
                                  cost, updates=updates, on_unused_input='ignore')
    test_model = theano.function([paragraph, questions, para_mask, q_mask, extraF, is_train],
                                 test_return, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    # para_list, Q_list, label_list, mask, vocab_size = load_train()
    n_train_batches = train_size / batch_size
    # remain_train = train_size % batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_test_batches = test_size / test_batch_size
    # remain_test = test_size % test_batch_size
    test_batch_start = list(numpy.arange(n_test_batches) * test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter counts how many batches have been trained on, accumulated over epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            # haha = para_mask[para_id:para_id+batch_size]
            # print haha
            # for i in range(batch_size):
            #     print len(haha[i])
            cost_i += train_model(
                numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id + batch_size]], dtype='int32'),
                numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id + batch_size]], dtype='int32'),
                numpy.asarray([train_label_list[id] for id in train_ids[para_id:para_id + batch_size]], dtype='int32'),
                numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id + batch_size]], dtype=theano.config.floatX),
                numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id + batch_size]], dtype=theano.config.floatX),
                numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id + batch_size]], dtype=theano.config.floatX),
                1)
            # print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                # writefile = codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8')
                # writefile.write('{')
                pred_dict = {}
                # exact_match = 0.0
                # F1_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids = test_model(
                        numpy.asarray(test_para_list[test_para_id:test_para_id + test_batch_size], dtype='int32'),
                        numpy.asarray(test_Q_list[test_para_id:test_para_id + test_batch_size], dtype='int32'),
                        numpy.asarray(test_para_mask[test_para_id:test_para_id + test_batch_size], dtype=theano.config.floatX),
                        numpy.asarray(test_mask[test_para_id:test_para_id + test_batch_size], dtype=theano.config.floatX),
                        numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id + test_batch_size], dtype=theano.config.floatX),
                        0)
                    # print distribution_matrix
                    test_para_wordlist_list = test_text_list[test_para_id:test_para_id + test_batch_size]
                    # para_gold_ansset_list = q_ansSet_list[test_para_id:test_para_id+test_batch_size]
                    q_ids_batch = q_idlist[test_para_id:test_para_id + test_batch_size]
                    # print 'q_ids_batch:', q_ids_batch
                    # paralist_extra_features = test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    # sub_para_mask = test_para_mask[test_para_id:test_para_id+batch_size]
                    # para_len = len(test_para_wordlist_list[0])
                    # if para_len != len(distribution_matrix[0]):
                    #     print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                    #     exit(0)
                    # q_size = len(distribution_matrix)
                    q_amount += test_batch_size
                    # print q_size
                    # print test_para_word_list
                    # Q_list_inword = test_Q_list_word[test_para_id:test_para_id+test_batch_size]
                    for q in range(test_batch_size):  # for each question
                        # if len(distribution_matrix[q]) != len(test_label_matrix[q]):
                        #     print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        # else:
                        #     ss = len(distribution_matrix[q])
                        #     combine_list = []
                        #     for ii in range(ss):
                        #         combine_list.append(str(distribution_matrix[q][ii]) + '(' + str(test_label_matrix[q][ii]) + ')')
                        #     print combine_list
                        # exit(0)
                        # print 'distribution_matrix[q]:', distribution_matrix[q]
                        pred_ans = decode_predict_id(batch_predict_ids[q], test_para_wordlist_list[q])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
                        # writefile.write('"' + str(q_id) + '": "' + pred_ans + '", ')
                        # pred_ans = extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
                        # q_gold_ans_set = para_gold_ansset_list[q]
                        # # print test_para_wordlist_list[q]
                        # # print Q_list_inword[q]
                        # # print pred_ans.encode('utf8'), q_gold_ans_set
                        # if pred_ans in q_gold_ans_set:
                        #     exact_match += 1
                        # F1 = MacroF1(pred_ans, q_gold_ans_set)
                        # F1_match += F1
                with codecs.open(rootPath + 'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json', rootPath + 'predictions.txt')
                # F1_acc = F1_match / q_amount
                # exact_acc = exact_match / q_amount
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                if max_exact_acc > max_EM:
                    store_model_to_file(rootPath + 'Best_Paras_conv_' + str(max_exact_acc), params)
                    print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
                # os.system('python evaluate-v1.1.py ' + rootPath + 'dev-v1.1.json ' + rootPath + 'predictions.txt')

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))