def classifier(rng, common_input_l, common_input_r, sents_mask_l, sents_mask_r,
               drop_conv_W_2_pre, conv_b_2_pre, drop_conv_W_2_gate, conv_b_2_gate,
               drop_conv_W_2, conv_b_2, drop_conv_W_2_context, conv_b_2_context, labels):
    conv_layer_2_gate_l = Conv_with_Mask_with_Gate(rng,
            input_tensor3=common_input_l,
            mask_matrix=sents_mask_l,
            image_shape=(batch_size, 1, emb_size, maxSentLen),
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre, b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate, b_gate=conv_b_2_gate)
    conv_layer_2_gate_r = Conv_with_Mask_with_Gate(rng,
            input_tensor3=common_input_r,
            mask_matrix=sents_mask_r,
            image_shape=(batch_size, 1, emb_size, maxSentLen),
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre, b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate, b_gate=conv_b_2_gate)

    # the gated representations feed the attentive convolution below
    l_input_4_att = conv_layer_2_gate_l.output_tensor3  # conv_layer_2_gate_l.masked_conv_out_sigmoid*conv_layer_2_pre_l.masked_conv_out+(1.0-conv_layer_2_gate_l.masked_conv_out_sigmoid)*common_input_l
    r_input_4_att = conv_layer_2_gate_r.output_tensor3  # conv_layer_2_gate_r.masked_conv_out_sigmoid*conv_layer_2_pre_r.masked_conv_out+(1.0-conv_layer_2_gate_r.masked_conv_out_sigmoid)*common_input_r

    conv_layer_2 = Conv_for_Pair(rng,
            origin_input_tensor3=common_input_l,
            origin_input_tensor3_r=common_input_r,
            input_tensor3=l_input_4_att,
            input_tensor3_r=r_input_4_att,
            mask_matrix=sents_mask_l,
            mask_matrix_r=sents_mask_r,
            image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
            image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
            filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
            filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
            W=drop_conv_W_2, b=conv_b_2,
            W_context=drop_conv_W_2_context, b_context=conv_b_2_context)
    attentive_sent_embeddings_l_2 = conv_layer_2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r_2 = conv_layer_2.attentive_maxpool_vec_r
    # attentive_sent_sumpool_l_2 = conv_layer_2.attentive_sumpool_vec_l
    # attentive_sent_sumpool_r_2 = conv_layer_2.attentive_sumpool_vec_r

    HL_layer_1_input = T.concatenate([attentive_sent_embeddings_l_2,
                                      attentive_sent_embeddings_r_2,
                                      attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2], axis=1)
    HL_layer_1_input_size = hidden_size[1] * 3  # +extra_size #+(maxSentLen*2+10*2) #+hidden_size[1]*3+1
    HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size,
                             n_out=hidden_size[0], activation=T.nnet.relu)
    HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0],
                             n_out=hidden_size[0], activation=T.nnet.relu)

    LR_input_size = HL_layer_1_input_size + 2 * hidden_size[0]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # weight matrix of the 3-way classifier
    LR_b = theano.shared(value=np.zeros((3,), dtype=theano.config.floatX),
                         name='LR_b', borrow=True)  # one bias per target class
    LR_para = [U_a, LR_b]
    LR_input = T.tanh(T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1))
    layer_LR = LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=3,
                                  W=U_a, b=LR_b)  # a linear transform of the feature vector followed by softmax
    loss = layer_LR.negative_log_likelihood(labels)  # negative log-likelihood loss for classification; lower is better
    return loss, LR_para + HL_layer_1.params + HL_layer_2.params, layer_LR.p_y_given_x, layer_LR.errors(labels)
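# The commented-out expressions above suggest how Conv_with_Mask_with_Gate appears to form its
# output_tensor3: a sigmoid gate interpolates between the convolution output and the raw input,
# highway-style. The layer itself is defined elsewhere; the sketch below only restates that gating
# arithmetic in plain NumPy with hypothetical names, as an illustration rather than the layer's
# actual implementation.
import numpy as np

def gated_combine_sketch(conv_out, gate_logits, original_input):
    # sigmoid(gate_logits) * conv_out + (1 - sigmoid(gate_logits)) * original_input,
    # applied element-wise over tensors of shape (batch, emb_size, sent_len)
    gate = 1.0 / (1.0 + np.exp(-gate_logits))
    return gate * conv_out + (1.0 - gate) * original_input

# with a strongly negative gate the output falls back to the original input
_x = np.random.randn(2, 4, 5)
_c = np.random.randn(2, 4, 5)
assert np.allclose(gated_combine_sketch(_c, -50.0 * np.ones_like(_x), _x), _x)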
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'): model_options = locals().copy() print "model options", model_options seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl') test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len) test_sents_l=np.asarray(test_sents_l, dtype='int32') test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX) test_sents_r=np.asarray(test_sents_r, dtype='int32') test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX) test_word1=np.asarray(test_word1, dtype='int32') test_word2=np.asarray(test_word2, dtype='int32') test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX) test_labels_store=np.asarray(test_labels, dtype='int32') test_extra=np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size=len(test_sents_l) print ' test size: ', test_size vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings]) #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1) word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para=[conv_W, conv_b,conv_W_context] ''' attentive convolution function ''' term_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = word1_mask, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1) term_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_input_r, input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_input_r, mask_matrix = word1_mask, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1) def_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_input_l, input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = sents_mask_l, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = 
T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1) def_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_input_r, input_tensor3=embed_input_l, input_tensor3_r = embed_input_r, mask_matrix = sents_mask_l, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1) if p_mode == 'conc': p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4*4*hidden_size[1] else: p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0) p_len =4*hidden_size[1] # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) # HL_input_size=p_len+1+1 # # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh) "form input to LR classifier" LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) LR_input_size=p_len+1+1 # LR_input = HL_layer_1.output # LR_input_size = hidden_size[1] U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) #basically it is a multiplication between weight matrix and input feature vector loss=layer_LR.negative_log_likelihood(labels) #for classification task, we usually used negative log likelihood as loss, the lower the better. # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() params = NN_para+LR_para #[init_embeddings] # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params) load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params) ''' 0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424 0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538 ''' test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time= mid_time epoch = 0 done_looping = False n_test_batches=test_size/batch_size n_test_remain = test_size%batch_size if n_test_remain!=0: test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size] else: test_batch_start=list(np.arange(n_test_batches)*batch_size) # max_acc_dev=0.0 max_ap_test=0.0 max_ap_topk_test=0.0 max_f1=0.0 pred_labels =[] probs = [] gold_labels =[] error_sum=0.0 for idd, test_batch_id in enumerate(test_batch_start): # for each test batch pred_i, prob_i=test_model( test_sents_l[test_batch_id:test_batch_id+batch_size], test_masks_l[test_batch_id:test_batch_id+batch_size], test_sents_r[test_batch_id:test_batch_id+batch_size], test_masks_r[test_batch_id:test_batch_id+batch_size], test_word1[test_batch_id:test_batch_id+batch_size], test_word2[test_batch_id:test_batch_id+batch_size], test_word1_mask[test_batch_id:test_batch_id+batch_size], test_word2_mask[test_batch_id:test_batch_id+batch_size], test_extra[test_batch_id:test_batch_id+batch_size]) # error_sum+=error_i pred_labels+=list(pred_i) probs+=list(prob_i) print len(test_sents_l), len(probs) if n_test_remain !=0: probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:] print len(test_sents_l), len(probs) assert len(test_sents_l) == len(probs) assert sum(group_size_list) == len(probs) #max prob in group max_probs = [] prior_size = 0 for i in range(len(group_size_list)): sub_probs = probs[prior_size:prior_size+group_size_list[i]] prior_size += group_size_list[i] max_probs.append(max(sub_probs)) print len(group_size_list),len(max_probs),len(test_labels) assert len(test_labels) == len(max_probs) # test_acc=1.0-error_sum/(len(test_batch_start)) test_ap = apk(test_labels, max_probs, k=len(test_labels)) test_ap_top100 = apk(test_labels, max_probs, k=100) # if test_ap > max_ap_test: # max_ap_test=test_ap # store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params) # if test_ap_top100 > max_ap_topk_test: # max_ap_topk_test=test_ap_top100 print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
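# The loop above scores every (definition_l, definition_r) combination for a term pair and then
# collapses those scores into one probability per pair by taking the maximum inside each group,
# where group_size_list records how many combinations belong to each pair. A self-contained
# restatement of that aggregation (helper name is hypothetical):
def max_prob_per_group(probs, group_size_list):
    # probs: flat list of positive-class probabilities, ordered group by group
    max_probs = []
    start = 0
    for size in group_size_list:
        max_probs.append(max(probs[start:start + size]))
        start += size
    assert start == len(probs)  # every probability must belong to exactly one group
    return max_probs

# e.g. max_prob_per_group([0.1, 0.7, 0.4, 0.2], [2, 2]) returns [0.7, 0.4]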
def evaluate_lenet5(learning_rate=0.01, n_epochs=10, L2_weight=0.000001, extra_size=4, emb_size=300, posi_emb_size=50, batch_size=50, filter_size=[3, 3], maxSentLen=50, hidden_size=300): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SciTailV1_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable posi_rand_values = rng.normal( 0.0, 0.01, (maxSentLen, posi_emb_size)) #generate a matrix by Gaussian distribution posi_embeddings = theano.shared( value=np.array(posi_rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size[0])) conv_W_posi, conv_b_posi = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size + posi_emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size, 1, emb_size, 1)) NN_para = [conv_W, conv_b, conv_W_posi, conv_b_posi, conv_W_context] ''' attentive convolution function ''' attentive_conv_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size, 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size, 1, emb_size, 1), W=conv_W, b=conv_b, W_posi=conv_W_posi, b_posi=conv_b_posi, W_context=conv_W_context, b_context=conv_b_context, posi_emb_matrix=posi_embeddings, posi_emb_size=posi_emb_size) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r sent_embeddings_l = attentive_conv_layer.maxpool_vec_l sent_embeddings_r = attentive_conv_layer.maxpool_vec_r "form input to LR classifier" LR_input = T.concatenate([ sent_embeddings_l, sent_embeddings_r, sent_embeddings_l * sent_embeddings_r, attentive_sent_embeddings_l, attentive_sent_embeddings_r, attentive_sent_embeddings_l * attentive_sent_embeddings_r ], axis=1) LR_input_size = 6 * hidden_size U_a = create_ensemble_para( rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. params = [init_embeddings, posi_embeddings] + NN_para + LR_para # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() cost = loss #+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') dev_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() dev_error_sum = 0.0 for dev_batch_id in dev_batch_start: # for each test batch dev_error_i = dev_model( dev_sents_l[dev_batch_id:dev_batch_id + batch_size], dev_masks_l[dev_batch_id:dev_batch_id + batch_size], dev_sents_r[dev_batch_id:dev_batch_id + batch_size], dev_masks_r[dev_batch_id:dev_batch_id + batch_size], dev_labels_store[dev_batch_id:dev_batch_id + batch_size]) dev_error_sum += dev_error_i dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start)) if dev_acc > max_acc_dev: max_acc_dev = dev_acc print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i test_acc = 1.0 - error_sum / (len(test_batch_start)) if test_acc > max_acc_test: max_acc_test = test_acc store_model_to_file( '/home/wenpeng/workspace/SciTail/src/model_para_' + str(max_acc_test), params) print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test else: print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
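# The batch-start lists above append [size - batch_size] so that the final, possibly partial batch
# is served as a full-sized batch that overlaps the previous one (the compiled Theano functions
# expect exactly batch_size rows). The training list appends unconditionally, which duplicates the
# last start when the size divides evenly; the test loops elsewhere guard with a remainder check.
# A small sketch of the guarded variant (function name is hypothetical):
import numpy as np

def fixed_size_batch_starts(n_examples, batch_size):
    n_batches = n_examples // batch_size
    starts = list(np.arange(n_batches) * batch_size)
    if n_examples % batch_size != 0:
        # the last start overlaps the previous batch so every batch has batch_size rows
        starts.append(n_examples - batch_size)
    return starts

# e.g. fixed_size_batch_starts(230, 100) yields [0, 100, 130]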
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3, 3], maxSentLen=40, hidden_size=[300, 300], max_term_len=4, p_mode='conc'): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words( maxlen=maxSentLen, wordlen=max_term_len ) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id = load_task_hyper_vs_all_with_words( LenciBenotto_file, maxSentLen, word2id, wordlen=max_term_len) store_word2id(word2id, root_dic + 'LenciBenotto_word2id.pkl') # exit(0) total_size = len(all_sentences_l) hold_test_size = 10000 train_size = total_size - hold_test_size train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32') test_sents_l = np.asarray(test_sents_l, dtype='int32') train_masks_l = np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX) test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32') test_sents_r = np.asarray(test_sents_r, dtype='int32') train_masks_r = np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX) test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX) train_word1 = np.asarray(all_word1[:train_size], dtype='int32') train_word2 = np.asarray(all_word2[:train_size], dtype='int32') test_word1 = np.asarray(test_word1, dtype='int32') test_word2 = np.asarray(test_word2, dtype='int32') train_word1_mask = np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX) train_word2_mask = np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX) test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[:train_size], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32') test_labels_store = np.asarray(test_labels, dtype='int32') train_extra = np.asarray(all_extra[:train_size], dtype=theano.config.floatX) test_extra = np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) 
#generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable store_model_to_file(root_dic + 'LenciBenotto_best_para_init_embeddings', [init_embeddings]) #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape( (batch_size, word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape( (batch_size, word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'), axis=1) word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para = [conv_W, conv_b, conv_W_context] ''' attentive convolution function ''' term_vs_term_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1), origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), input_tensor3=embed_word1.dimshuffle(0, 2, 1), input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), mask_matrix=word1_mask, mask_matrix_r=word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r=(batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([ tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r, tt_embeddings_l - tt_embeddings_r ], axis=1) term_vs_def_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1), origin_input_tensor3_r=embed_input_r, input_tensor3=embed_word1.dimshuffle(0, 2, 1), input_tensor3_r=embed_input_r, mask_matrix=word1_mask, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, 
W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([ td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r, td_embeddings_l - td_embeddings_r ], axis=1) def_vs_term_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), input_tensor3=embed_input_l, input_tensor3_r=embed_word2.dimshuffle(0, 2, 1), mask_matrix=sents_mask_l, mask_matrix_r=word2_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = T.concatenate([ dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r, dt_embeddings_l - dt_embeddings_r ], axis=1) def_vs_def_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1, emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([ dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r, dd_embeddings_l - dd_embeddings_r ], axis=1) if p_mode == 'conc': p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4 * 4 * hidden_size[1] else: p = T.max(T.concatenate([ p_ww.dimshuffle('x', 0, 1), p_wd.dimshuffle('x', 0, 1), p_dw.dimshuffle('x', 0, 1), p_dd.dimshuffle('x', 0, 1) ], axis=0), axis=0) p_len = 4 * hidden_size[1] # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1) # HL_input_size=p_len+1+1 # # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh) "form input to LR classifier" LR_input = T.concatenate([ p, cosine_matrix1_matrix2_rowwise(word1_embedding, word2_embedding).dimshuffle(0, 'x'), extra.dimshuffle(0, 'x') ], axis=1) LR_input_size = p_len + 1 + 1 # LR_input = HL_layer_1.output # LR_input_size = hidden_size[1] U_a = create_ensemble_para( rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX), name='LR_b', borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25 ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. 
# L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum() params = NN_para + LR_para #[init_embeddings] cost = loss #+L2_weight*L2_reg updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function([ sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids, word2_ids, word1_mask, word2_mask, extra, labels ], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function([ sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids, word2_ids, word1_mask, word2_mask, extra, labels ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] # n_dev_batches=dev_size/batch_size # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size if n_test_remain != 0: test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] else: test_batch_start = list(np.arange(n_test_batches) * batch_size) # max_acc_dev=0.0 max_ap_test = 0.0 max_ap_topk_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model( train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_word1[train_id_batch], train_word2[train_id_batch], train_word1_mask[train_id_batch], train_word2_mask[train_id_batch], train_extra[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_labels = [] probs = [] gold_labels = [] error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i, pred_i, prob_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_word1[test_batch_id:test_batch_id + batch_size], test_word2[test_batch_id:test_batch_id + batch_size], test_word1_mask[test_batch_id:test_batch_id + batch_size], test_word2_mask[test_batch_id:test_batch_id + batch_size], test_extra[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i pred_labels += list(pred_i) probs += list(prob_i) if n_test_remain != 0: probs = probs[:(len(test_batch_start) - 1) * batch_size] + probs[-n_test_remain:] 
                assert len(test_labels) == len(probs)
                # test_acc = 1.0 - error_sum / (len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)
                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                    store_model_to_file(
                        root_dic + 'LenciBenotto_best_para_' + str(max_ap_test),
                        params)
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq

    end_time = time.time()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
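# Because the last test batch overlaps the previous one whenever test_size % batch_size != 0, the
# collected probability list is longer than the number of test instances; the slicing before the
# assert keeps the first (len(test_batch_start) - 1) * batch_size predictions plus the final
# n_test_remain ones. The same trimming in isolation (names are hypothetical):
def trim_overlapping_tail(probs, n_full_batches, batch_size, n_remain):
    # predictions for the overlapping region appear twice; keep the first copies for the fully
    # covered prefix and the last n_remain scores for the tail instances
    if n_remain == 0:
        return probs
    return probs[:n_full_batches * batch_size] + probs[-n_remain:]

# e.g. with 7 instances and batch_size 3 the batches cover [0:3], [3:6], [4:7];
# trim_overlapping_tail(collected, 2, 3, 1) keeps 6 + 1 = 7 scores in instance order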
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=0.0000001, extra_size=4, emb_size=300, batch_size=50, filter_size=[3, 5], maxSentLen=60, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_NYT_dataset( maxSentLen, word2id) train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(test_sents_l, dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(test_sents_r, dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(test_labels, dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) embed_input_l = embed_input( init_embeddings, sents_ids_l ) #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = embed_input( init_embeddings, sents_ids_r ) #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) '''create_AttentiveConv_params ''' conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1)) conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1])) conv_W_context_2, conv_b_context_2 = create_conv_para( rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1)) NN_para = [ conv_W, conv_b, conv_W_context, conv_W_2, conv_b_2, conv_W_context_2 ] ''' attentive convolution function ''' attentive_conv_layer = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, hidden_size[0], maxSentLen), image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen), filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]), filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r attentive_conv_layer_2 = Conv_for_Pair( rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r=embed_input_r, input_tensor3=embed_input_l, input_tensor3_r=embed_input_r, mask_matrix=sents_mask_l, mask_matrix_r=sents_mask_r, image_shape=(batch_size, 1, hidden_size[0], maxSentLen), image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen), filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]), filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1), W=conv_W_2, b=conv_b_2, W_context=conv_W_context_2, b_context=conv_b_context_2) attentive_sent_embeddings_l_2 = attentive_conv_layer_2.attentive_maxpool_vec_l attentive_sent_embeddings_r_2 = attentive_conv_layer_2.attentive_maxpool_vec_r #form input to HL layers HL_layer_1_input = T.concatenate([ attentive_sent_embeddings_l, attentive_sent_embeddings_r, attentive_sent_embeddings_l * attentive_sent_embeddings_r, attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2, attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2 ], axis=1) HL_layer_1_input_size = 6 * hidden_size[1] HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[1], activation=T.nnet.relu) HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[1], n_out=hidden_size[1], activation=T.nnet.relu) # LR_input_size=HL_layer_1_input_size+2*hidden_size[0] "form input to LR classifier" LR_input = T.tanh( T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1)) LR_input_size = HL_layer_1_input_size + 2 * hidden_size[1] U_a = create_ensemble_para( rng, 3, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX), name='LR_b', 
borrow=True) #bias for each target class LR_para = [U_a, LR_b] layer_LR = LogisticRegression( rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b ) #basically it is a multiplication between weight matrix and input feature vector loss = layer_LR.negative_log_likelihood( labels ) #for classification task, we usually used negative log likelihood as loss, the lower the better. ''' testing ''' test_preds = T.argmax(layer_LR.p_y_given_x, axis=1) transfered_preds = T.eq(test_preds, 2) test_error = T.mean(T.neq(transfered_preds, labels)) params = [init_embeddings ] + NN_para + HL_layer_1.params + HL_layer_2.params + LR_para cost = loss updates = Gradient_Cost_Para(cost, params, learning_rate) train_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], [test_error, transfered_preds], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_dev_batches = dev_size / batch_size dev_batch_start = list( np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size] n_test_batches = test_size / batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] max_acc_dev = 0.0 max_acc_test = 0.0 max_f1 = 0.0 cost_i = 0.0 train_indices = range(train_size) while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle( train_indices ) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch], train_sents_r[train_id_batch], train_masks_r[train_id_batch], train_labels_store[train_id_batch]) #after each 1000 batches, we test the performance of the model on all test data if iter % 100 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), 'uses ', ( time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_labels = [] gold_labels = [] error_sum = 0.0 for idd, test_batch_id in enumerate( test_batch_start): # for each test batch error_i, pred_labels_i = test_model( test_sents_l[test_batch_id:test_batch_id + batch_size], test_masks_l[test_batch_id:test_batch_id + batch_size], test_sents_r[test_batch_id:test_batch_id + batch_size], test_masks_r[test_batch_id:test_batch_id + batch_size], test_labels_store[test_batch_id:test_batch_id + batch_size]) error_sum += error_i pred_labels += list(pred_labels_i) gold_labels += list( test_labels_store[test_batch_id:test_batch_id + batch_size]) test_acc = 1.0 - error_sum / (len(test_batch_start)) test_f1 = f1_score_2_binary_list( gold_labels, pred_labels) #, average='binary') if test_acc > max_acc_test: max_acc_test = test_acc if test_f1 > max_f1: max_f1 = test_f1 # 
store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params) print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1 print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) return max_acc_test
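# f1_score_2_binary_list is imported from elsewhere and not shown in this section. Judging from
# how it is called (two aligned lists of 0/1 labels, returning a single score), a minimal stand-in
# could compute the F1 of the positive class as below; this is an assumption about its behavior,
# not the actual implementation.
def binary_f1_sketch(gold_labels, pred_labels):
    tp = sum(1 for g, p in zip(gold_labels, pred_labels) if g == 1 and p == 1)
    fp = sum(1 for g, p in zip(gold_labels, pred_labels) if g == 0 and p == 1)
    fn = sum(1 for g, p in zip(gold_labels, pred_labels) if g == 1 and p == 0)
    if tp == 0:
        return 0.0
    precision = tp / float(tp + fp)
    recall = tp / float(tp + fn)
    return 2.0 * precision * recall / (precision + recall)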
def evaluate_lenet5(term1_str, term2_str): emb_size=300 filter_size=[3,3] maxSentLen=40 hidden_size=[300,300] max_term_len=4 p_mode = 'conc' batch_size = 1 term1_def, source1 = load_concept_def(term1_str) print '\n',term1_str, ':\t', term1_def,'\t', source1,'\n' term2_def, source2 = load_concept_def(term2_str) print '\n',term2_str, ':\t', term2_def, '\t', source2,'\n' # exit(0) word2id = load_word2id('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_word2id.pkl') seed=1234 np.random.seed(seed) rng = np.random.RandomState(seed) #random seed, control the model generates the same results # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len) #minlen, include one label, at least one word in the sentence # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id =load_ACE05_dataset(maxSentLen, word2id) # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id = load_EVAlution_hyper_vs_all_with_words(maxSentLen, word2id, wordlen=max_term_len) test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask, test_extra, word2id = parse_individual_termPair(term1_str, term2_str, term1_def, term2_def, maxSentLen, word2id, wordlen=max_term_len) # total_size = len(all_sentences_l) # hold_test_size = 10000 # train_size = total_size - hold_test_size # train_sents_l=np.asarray(all_sentences_l[:train_size], dtype='int32') # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32') # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32') test_sents_l=np.asarray(test_sents_l, dtype='int32') # train_masks_l=np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX) # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX) # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX) test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX) # train_sents_r=np.asarray(all_sentences_r[:train_size], dtype='int32') # dev_sents_r=np.asarray(all_sentences_r[1] , dtype='int32') # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32') test_sents_r=np.asarray(test_sents_r, dtype='int32') # train_masks_r=np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX) # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX) # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX) test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX) # train_word1=np.asarray(all_word1[:train_size], dtype='int32') # train_word2=np.asarray(all_word2[:train_size], dtype='int32') test_word1=np.asarray(test_word1, dtype='int32') test_word2=np.asarray(test_word2, dtype='int32') # train_word1_mask=np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX) # train_word2_mask=np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX) test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX) test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX) # train_labels_store=np.asarray(all_labels[:train_size], dtype='int32') # dev_labels_store=np.asarray(all_labels[1], dtype='int32') # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32') # test_labels_store=np.asarray(test_labels, dtype='int32') # train_extra=np.asarray(all_extra[:train_size], 
dtype=theano.config.floatX) test_extra=np.asarray(test_extra, dtype=theano.config.floatX) # train_size=len(train_labels_store) # dev_size=len(dev_labels_store) test_size=len(test_extra) print ' test size: ', len(test_extra) vocab_size=len(word2id)+1 rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution #here, we leave code for loading word2vec to initialize words # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX) # id2word = {y:x for x,y in word2id.iteritems()} # word2vec=load_word2vec() # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True) #wrap up the python variable "rand_values" into theano variable # store_model_to_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings]) # exit(0) #now, start to build the input form of the model sents_ids_l=T.imatrix() sents_mask_l=T.fmatrix() sents_ids_r=T.imatrix() sents_mask_r=T.fmatrix() word1_ids = T.imatrix() word2_ids = T.imatrix() word1_mask = T.fmatrix() word2_mask = T.fmatrix() extra = T.fvector() # labels=T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' def embed_input(emb_matrix, sent_ids): return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size)) embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size)) word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1) word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1) '''create_AttentiveConv_params ''' conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1)) NN_para=[conv_W, conv_b,conv_W_context] ''' attentive convolution function ''' term_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = word1_mask, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1) term_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_word1.dimshuffle(0,2,1), origin_input_tensor3_r = embed_input_r, input_tensor3=embed_word1.dimshuffle(0,2,1), input_tensor3_r = embed_input_r, mask_matrix = word1_mask, 
mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, max_term_len), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1) def_vs_term_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1), input_tensor3=embed_input_l, input_tensor3_r = embed_word2.dimshuffle(0,2,1), mask_matrix = sents_mask_l, mask_matrix_r = word2_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, max_term_len), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1) def_vs_def_layer = Conv_for_Pair(rng, origin_input_tensor3=embed_input_l, origin_input_tensor3_r = embed_input_r, input_tensor3=embed_input_l, input_tensor3_r = embed_input_r, mask_matrix = sents_mask_l, mask_matrix_r = sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r = (batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[1], 1,emb_size, 1), W=conv_W, b=conv_b, W_context=conv_W_context, b_context=conv_b_context) dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1) if p_mode == 'conc': p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1) p_len = 4*4*hidden_size[1] else: p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0) p_len =4*hidden_size[1] "form input to LR classifier" LR_input = T.concatenate([p,extra.dimshuffle(0,'x')],axis=1) LR_input_size=p_len+1 U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2 LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True) #bias for each target class LR_para=[U_a, LR_b] layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector params = NN_para+LR_para #[init_embeddings] load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings]) load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params) test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
    params = NN_para+LR_para   #[init_embeddings]

    #load the pretrained embeddings and model parameters (machine-specific paths)
    load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings])
    load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)

    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ##############
    # TEST MODEL #
    ##############
    print '... testing'

    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    if n_test_remain!=0:
        #the last batch starts at test_size-batch_size, so it overlaps with the previous batch and only its final n_test_remain predictions are new
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)

    # max_acc_dev=0.0
    # max_ap_test=0.0
    # max_ap_topk_test=0.0
    # max_f1=0.0
    # cost_i=0.0
    # train_indices = range(train_size)

    for idd, test_batch_id in enumerate(test_batch_start):   #for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])
        print pred_i, prob_i
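# A minimal sketch (not part of the original script) of how the per-batch outputs printed
# above could be turned into corpus-level scores: collect pred_i and prob_i in two lists
# inside the loop, then call the helper below with test_labels_store. It assumes scikit-learn
# is available and that layer_LR.prop_for_posi is the positive-class probability; the helper
# name "aggregate_test_scores" is illustrative, not from the source.
def aggregate_test_scores(pred_batches, prob_batches, gold_labels, batch_size, test_size):
    import numpy as np
    from sklearn.metrics import average_precision_score
    n_test_remain = test_size % batch_size
    if n_test_remain != 0 and len(pred_batches) > 1:
        # the last batch started at test_size-batch_size, so only its final n_test_remain
        # predictions cover examples not already seen in earlier batches
        pred_batches = pred_batches[:-1] + [pred_batches[-1][-n_test_remain:]]
        prob_batches = prob_batches[:-1] + [prob_batches[-1][-n_test_remain:]]
    all_pred = np.concatenate(pred_batches)
    all_prob = np.concatenate(prob_batches)
    gold = np.asarray(gold_labels)[:len(all_pred)]
    acc = np.mean(all_pred == gold)                    # overall test accuracy
    ap = average_precision_score(gold, all_prob)       # average precision over the ranking
    return acc, ap
# e.g., inside the loop: pred_list.append(pred_i); prob_list.append(prob_i)
#       after the loop:  acc, ap = aggregate_test_scores(pred_list, prob_list, test_labels_store, batch_size, test_size)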
def four_way_attentiveConvNet(rng, batch_size, emb_size, hidden_size, filter_size, embed_word1, embed_word2, word1_mask, word2_mask, max_term_len, embed_input_l, sents_mask_l, embed_input_r, sents_mask_r, maxSentLen, conv_W, conv_b, conv_W_context, conv_b_context):
    '''
    Build the four attentive-convolution pairings (term vs term, term vs definition,
    definition vs term, definition vs definition) over shared parameters and return
    their pooled pair representations p_ww, p_wd, p_dw, p_dd.
    '''
    term_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=word1_mask,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r
    #each pair vector concatenates the two pooled embeddings with their element-wise product and difference
    p_ww = T.concatenate([
        tt_embeddings_l, tt_embeddings_r,
        tt_embeddings_l * tt_embeddings_r,
        tt_embeddings_l - tt_embeddings_r
    ], axis=1)

    term_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_input_r,
        mask_matrix=word1_mask,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([
        td_embeddings_l, td_embeddings_r,
        td_embeddings_l * td_embeddings_r,
        td_embeddings_l - td_embeddings_r
    ], axis=1)

    def_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=sents_mask_l,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r
    p_dw = T.concatenate([
        dt_embeddings_l, dt_embeddings_r,
        dt_embeddings_l * dt_embeddings_r,
        dt_embeddings_l - dt_embeddings_r
    ], axis=1)

    def_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([
        dd_embeddings_l, dd_embeddings_r,
        dd_embeddings_l * dd_embeddings_r,
        dd_embeddings_l - dd_embeddings_r
    ], axis=1)
    return p_ww, p_wd, p_dw, p_dd
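# A minimal usage sketch (not in the original script): the four inline Conv_for_Pair blocks
# built in the model above could be replaced by one call to four_way_attentiveConvNet, with
# the returned pair representations combined exactly as the p_mode branch does. The wrapper
# name "build_pair_representation" is illustrative only.
def build_pair_representation(p_mode, hidden_size, *conv_args, **conv_kwargs):
    p_ww, p_wd, p_dw, p_dd = four_way_attentiveConvNet(*conv_args, **conv_kwargs)
    if p_mode == 'conc':
        # concatenate the four pair vectors: 4 views of 4*hidden_size[1] dims each
        return T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1), 4*4*hidden_size[1]
    else:
        # element-wise max over the four views, keeping 4*hidden_size[1] dims
        stacked = T.concatenate([p_ww.dimshuffle('x', 0, 1), p_wd.dimshuffle('x', 0, 1),
                                 p_dw.dimshuffle('x', 0, 1), p_dd.dimshuffle('x', 0, 1)], axis=0)
        return T.max(stacked, axis=0), 4*hidden_size[1]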