def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300, batch_size=50,
                    describ_max_len=20, type_size=12, filter_size=[3, 5],
                    maxSentLen=200, hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_concMT_BBN_NI_epoch4.json'

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fixed random seed, so the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id = load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  # minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_il_and_MT(word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)

    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)  # wrap the python variable "rand_values" into a theano variable

    # now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  # batch*12
    other_labels = T.imatrix()  # batch*4
    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)  # the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(normalize_tensor3_colwise(common_input), type_size, axis=0)  # (batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape((type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2)  # (type_size, emb_size)
    repeat_des_input = T.tile(normalize_tensor3_colwise(des_input), (batch_size, 1, 1))  # (batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2]

    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
                                mask_matrix=sents_mask,
                                image_shape=(batch_size, 1, emb_size, maxSentLen),
                                filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
                                W=conv_W, b=conv_b)  # multiply the mask with conv_out to zero out the features produced by UNK (padding) positions
    sent_embeddings = conv_model.maxpool_vec  # (batch_size, hidden_size); each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
                                 mask_matrix=sents_mask,
                                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                                 filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
                                 W=conv_W2, b=conv_b2)
    sent_embeddings2 = conv_model2.maxpool_vec  # (batch_size, hidden_size)

    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [U1, W1, b1]  # U1 includes 3 matrices, W1 also includes 3 matrices, b1 is the bias
    # gru_input = common_input.dimshuffle((0, 2, 1))  # gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(rng,
                                                   origin_input_tensor3=common_input,
                                                   origin_input_tensor3_r=common_input,
                                                   input_tensor3=common_input,
                                                   input_tensor3_r=common_input,
                                                   mask_matrix=sents_mask,
                                                   mask_matrix_r=sents_mask,
                                                   image_shape=(batch_size, 1, emb_size, maxSentLen),
                                                   image_shape_r=(batch_size, 1, emb_size, maxSentLen),
                                                   filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
                                                   filter_shape_context=(hidden_size[0], 1, emb_size, 1),
                                                   W=conv_att_W, b=conv_att_b,
                                                   W_context=conv_W_context, b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(rng,
                                                    origin_input_tensor3=common_input,
                                                    origin_input_tensor3_r=common_input,
                                                    input_tensor3=common_input,
                                                    input_tensor3_r=common_input,
                                                    mask_matrix=sents_mask,
                                                    mask_matrix_r=sents_mask,
                                                    image_shape=(batch_size, 1, emb_size, maxSentLen),
                                                    image_shape_r=(batch_size, 1, emb_size, maxSentLen),
                                                    filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
                                                    filter_shape_context=(hidden_size[0], 1, emb_size, 1),
                                                    W=conv_att_W2, b=conv_att_b2,
                                                    W_context=conv_W_context2, b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l

    '''
    cross-DNN-dataless
    '''
    # first map the label (type description) embeddings into the hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0],
                             W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  # (type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(des_rep_hidden.T))  # (batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))

    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(cosine_scores)  # (batch_size, type_size)

    '''
    dataless top-30 fine-grained cosine
    '''
    fine_grained_cosine = T.batched_dot(repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input)  # (batch_size*type_size, maxsentlen, describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape((batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape((batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
                                   top_k_score_matrix, sent_embeddings, sent_embeddings2,
                                   gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2,
                                   bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size

    # classification layer: maps the concatenated feature vector to one score per SF type
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12,
                                       W=acnn_U_a, b=acnn_LR_b)  # basically a multiplication between the weight matrix and the input feature vector
    acnn_score_matrix = T.nnet.sigmoid(acnn_layer_LR.before_softmax)  # batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16,
                                             W=acnn_other_U_a, b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape((batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[T.repeat(T.arange(batch_size), 4),
                                              T.tile(T.arange(4), (batch_size)),
                                              other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W ** 2).sum() + (conv_W2 ** 2).sum() + (conv_att_W ** 2).sum() + (conv_att_W2 ** 2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)

    '''
    testing
    '''
    ensemble_NN_scores = acnn_score_matrix  # T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1), binarize_dataless_finegrained.dimshuffle('x',0,1)], axis=0)
    # sum_binarize_conc = T.sum(binarize_conc, axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  # 0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  # (batch, 4, 4)

    # train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function([sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
                                     cost, updates=updates,
                                     allow_input_downcast=True, on_unused_input='ignore')
    train_p2_model = theano.function([sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels],
                                     cost_other, updates=other_updates,
                                     allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask, des_id_matrix, des_mask],
                                 [binarize_prob, ensemble_scores, sum_tensor3],
                                 allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev = 0.0
    # max_meanf1_test = 0.0
    # max_weightf1_test = 0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter means how many batches have been run, taking the epoch loop into account
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch],
                                     label_sent, label_mask)
            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size]
                other_cost_i += train_p2_model(train_p2_sents[train_p2_id_batch],
                                               train_p2_masks[train_p2_id_batch],
                                               train_p2_labels[train_p2_id_batch],
                                               label_sent, label_mask,
                                               train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i += train_p2_model(
            #         train_p2_sents[train_p2_id_batch],
            #         train_p2_masks[train_p2_id_batch],
            #         train_p2_labels[train_p2_id_batch],
            #         label_sent,
            #         label_mask,
            #         train_p2_other_labels[train_p2_id_batch])

            # after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), str(other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        # the last batch overlaps the previous one, so keep only the genuinely new examples
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(test_lines, output_file_path,
                                                           pred_types, pred_confs, pred_others,
                                                           min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test over, min_mean_frame:', min_mean_frame
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
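

# ---------------------------------------------------------------------------
# Illustration only (not called by the pipeline above): a minimal NumPy sketch
# of the multi-label loss the function builds with
# T.where(labels < 1, 1.0 - score_matrix, score_matrix). Each SF type gets an
# independent sigmoid score, the gold multi-hot labels select the probability
# of the correct decision per type, and prediction binarizes at the same 0.3
# threshold as binarize_prob. All array values below are made up for the demo.
# ---------------------------------------------------------------------------
def _demo_multilabel_sigmoid_loss():
    import numpy as np
    scores = np.array([[0.9, 0.2, 0.6],
                       [0.4, 0.7, 0.1]])                    # sigmoid scores: (batch=2, type_size=3)
    labels = np.array([[1, 0, 1],
                       [0, 1, 0]])                          # gold multi-hot labels
    prob_pos = np.where(labels < 1, 1.0 - scores, scores)   # probability of the gold decision per type
    loss = -np.mean(np.log(prob_pos))                       # mean binary cross-entropy over batch*types
    pred = np.where(scores > 0.3, 1, 0)                     # same threshold as binarize_prob above
    return loss, pred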
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=300, batch_size=50,
                    filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fixed random seed, so the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(maxlen=maxSentLen)  # minlen, include one label, at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    # combined corpus for the domain discriminator: label 0 = train domain, 1 = test domain
    comb_sents = np.concatenate([train_sents, test_sents], axis=0)
    comb_masks = np.concatenate([train_masks, test_masks], axis=0)
    comb_labels = np.asarray([0] * train_size + [1] * test_size, dtype='int32')
    comb_size = len(comb_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([emb_root + 'wiki.en.vec',
                                                           emb_root + 'mono-lingual-il5-xinli.vec'], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)  # wrap the python variable "rand_values" into a theano variable

    # now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  # batch*12
    domain_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)  # the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    # bow_mean_emb = bow_emb / T.sum(sents_mask, axis=1).dimshuffle(0, 'x')

    conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
                                mask_matrix=sents_mask,
                                image_shape=(batch_size, 1, emb_size, maxSentLen),
                                filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
                                W=conv_W, b=conv_b)  # multiply the mask with conv_out to zero out the features produced by UNK (padding) positions
    sent_embeddings = conv_model.maxpool_vec  # (batch_size, hidden_size); each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
                                 mask_matrix=sents_mask,
                                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                                 filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
                                 W=conv_W2, b=conv_b2)
    sent_embeddings2 = conv_model2.maxpool_vec  # (batch_size, hidden_size)

    '''
    adversarial
    '''
    domain_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb], axis=1)
    domain_input_size = hidden_size[0] * 2 + emb_size
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(rng, domain_input_size, hidden_size[0])
    HL_layer_2_W, HL_layer_2_b = create_HiddenLayer_para(rng, hidden_size[0], hidden_size[0])
    adver_HL_para = [HL_layer_1_W, HL_layer_1_b, HL_layer_2_W, HL_layer_2_b]
    HL_layer_1 = HiddenLayer(rng, input=domain_input, n_in=domain_input_size, n_out=hidden_size[0],
                             W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh)
    HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[0],
                             W=HL_layer_2_W, b=HL_layer_2_b, activation=T.tanh)
    disc_para_W, disc_para_b = create_LR_para(rng, hidden_size[0], 2)
    conf_para_W, conf_para_b = create_LR_para(rng, hidden_size[0], 2)
    conf_para_W2, conf_para_b2 = create_LR_para(rng, hidden_size[0], 2)
    adver_LR_para = [disc_para_W, disc_para_b, conf_para_W, conf_para_b, conf_para_W2, conf_para_b2]
    disc_layer_LR = LogisticRegression(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=2,
                                       W=disc_para_W, b=disc_para_b)
    conf_layer_LR = LogisticRegression(rng, input=HL_layer_2.output, n_in=hidden_size[0], n_out=2,
                                       W=conf_para_W, b=conf_para_b)
    conf2_layer_LR = LogisticRegression(rng, input=HL_layer_2.output, n_in=hidden_size[0], n_out=2,
                                        W=conf_para_W2, b=conf_para_b2)
    disc_loss = disc_layer_LR.negative_log_likelihood(domain_labels)
    conf_loss = conf_layer_LR.negative_log_likelihood_specific_label(0)
    conf_loss2 = conf2_layer_LR.negative_log_likelihood_specific_label(1)
    adver_loss = disc_loss + conf_loss + conf_loss2
    adver_para = adver_HL_para + adver_LR_para + NN_para
    adver_updates = Gradient_Cost_Para(adver_loss, adver_para, learning_rate)

    '''
    SF classification
    '''
    LR_input = HL_layer_2.output  # T.concatenate([sent_embeddings, sent_embeddings2, bow_emb], axis=1)
    LR_input_size = hidden_size[0]
    # classification layer: maps the feature vector to one score per SF type
    U_a = create_ensemble_para(rng, 12, LR_input_size)  # the weight matrix
    LR_b = theano.shared(value=np.zeros((12,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b)  # basically a multiplication between the weight matrix and the input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  # batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)
    loss = -T.mean(T.log(prob_pos))
    # loss = layer_LR.negative_log_likelihood(labels)  # for classification tasks we usually use negative log likelihood as the loss; the lower the better.

    params = NN_para + adver_HL_para + LR_para  # put all model parameters together
    cost = loss + 1e-4 * ((conv_W ** 2).sum() + (conv_W2 ** 2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    '''
    testing
    '''
    binarize_prob = T.where(score_matrix > 0.3, 1, 0)

    # train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates,
                                  allow_input_downcast=True, on_unused_input='ignore')
    comb_model = theano.function([sents_id_matrix, sents_mask, domain_labels], adver_loss,
                                 updates=adver_updates,
                                 allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob,
                                 allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_comb_batches = comb_size / batch_size
    comb_batch_start = list(np.arange(n_comb_batches) * batch_size) + [comb_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev = 0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    comb_indices = range(comb_size)
    cost_i = 0.0
    adver_cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(comb_indices)
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter means how many batches have been run, taking the epoch loop into account
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])
            comb_id_batch = comb_indices[batch_id:batch_id + batch_size]
            adver_cost_i += comb_model(comb_sents[comb_id_batch],
                                       comb_masks[comb_id_batch],
                                       comb_labels[comb_id_batch])

            # after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ', iter, ' average cost: ', cost_i / iter, adver_cost_i / iter, 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(test_sents[test_batch_id:test_batch_id + batch_size],
                                             test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id + batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels:', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)
                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
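

# ---------------------------------------------------------------------------
# Illustration only: a toy NumPy sketch of the adversarial objective above,
# under the assumption that negative_log_likelihood_specific_label(k) is the
# NLL obtained when every example is forced to label k. The discriminator term
# uses the true domain labels (0 = train, 1 = test), while the two "confusion"
# terms reward shared features that look like a fixed domain, which pressures
# the encoder toward domain-invariant representations. The logits are made up.
# ---------------------------------------------------------------------------
def _demo_adversarial_losses():
    import numpy as np
    logits = np.array([[2.0, 0.5],
                       [0.1, 1.5]])                          # (batch=2, 2) domain logits
    probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
    domain_labels = np.array([0, 1])
    disc_loss = -np.mean(np.log(probs[np.arange(2), domain_labels]))  # true-label NLL
    conf_loss = -np.mean(np.log(probs[:, 0]))                # force label 0 for all examples
    conf_loss2 = -np.mean(np.log(probs[:, 1]))               # force label 1 for all examples
    return disc_loss + conf_loss + conf_loss2                # analogue of adver_loss above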
def evaluate_lenet5(learning_rate=0.05, n_epochs=700, il5_emb_size=300, train_ratio=0.8,
                    emb_size=300, batch_size=50, filter_size=[3, 5], max_il5_phrase_len=5,
                    hidden_size=300):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fixed random seed, so the model generates the same results
    # srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    # en_word2vec = load_fasttext_word2vec_given_file(root+'wiki.en.vec')
    en_word2vec = load_fasttext_word2vec_given_file('/save/wenpeng/datasets/word2vec_words_300d.txt', 300)
    il5_word2vec = load_fasttext_word2vec_given_file(root + 'il5_300d_word2vec.txt', 300)
    english_vocab = set(en_word2vec.keys())
    il5_vocab = set(il5_word2vec.keys())
    source_ids, source_masks, target_ids, il5_word2id, english_word2id = load_trainingdata_il5(
        root, english_vocab, il5_vocab, max_il5_phrase_len)
    # print len(english_vocab)
    # print len(english_word2id)
    # print set(english_word2id.keys()) - english_vocab
    assert set(english_word2id.keys()).issubset(english_vocab)
    assert set(il5_word2id.keys()).issubset(il5_vocab)

    data_size = len(target_ids)
    train_size = int(data_size * train_ratio)
    dev_size = data_size - train_size
    print 'training size: ', train_size, ' dev_size: ', dev_size

    # all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(maxlen=maxSentLen)  # minlen, include one label, at least one word in the sentence
    train_sents = np.asarray(source_ids[:train_size], dtype='int32')
    train_masks = np.asarray(source_masks[:train_size], dtype=theano.config.floatX)
    train_target = np.asarray(target_ids[:train_size], dtype='int32')

    dev_sents = np.asarray(source_ids[-dev_size:], dtype='int32')
    dev_masks = np.asarray(source_masks[-dev_size:], dtype=theano.config.floatX)
    dev_target = np.asarray(target_ids[-dev_size:], dtype='int32')

    en_vocab_size = len(english_word2id)
    en_rand_values = rng.normal(0.0, 0.01, (en_vocab_size, emb_size))  # generate a matrix by Gaussian distribution
    en_id2word = {y: x for x, y in english_word2id.iteritems()}
    en_rand_values = load_word2vec_to_init(en_rand_values, en_id2word, en_word2vec)
    en_embeddings = theano.shared(value=np.array(en_rand_values, dtype=theano.config.floatX), borrow=True)  # wrap the python variable "en_rand_values" into a theano variable

    il5_vocab_size = len(il5_word2id) + 1  # add one zero pad index
    il5_rand_values = rng.normal(0.0, 0.01, (il5_vocab_size, il5_emb_size))
    il5_id2word = {y: x for x, y in il5_word2id.iteritems()}
    il5_rand_values = load_word2vec_to_init(il5_rand_values, il5_id2word, il5_word2vec)
    il5_embeddings = theano.shared(value=np.array(il5_rand_values, dtype=theano.config.floatX), borrow=True)

    # source_embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)
    # target_embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)

    # now, start to build the input form of the model
    batch_ids = T.imatrix()  # (batch, maxlen)
    batch_masks = T.fmatrix()  # (batch, maxlen)
    batch_targets = T.ivector()
    batch_test_source = T.fmatrix()  # (batch, emb_size)

    input_batch = il5_embeddings[batch_ids.flatten()].reshape((batch_size, max_il5_phrase_len, il5_emb_size))  # (batch, maxlen, emb_size)
    masked_input_batch = T.sum(input_batch * batch_masks.dimshuffle(0, 1, 'x'), axis=1)  # (batch, emb_size)
    # masked_input_batch = masked_input_batch / T.sum(batch_masks, axis=1).dimshuffle(0, 'x')
    target_embs = en_embeddings[batch_targets]

    HL_layer_W1, HL_layer_b1 = create_HiddenLayer_para(rng, hidden_size, hidden_size)
    HL_layer_W2, HL_layer_b2 = create_HiddenLayer_para(rng, hidden_size, hidden_size)
    HL_layer_W3, HL_layer_b3 = create_HiddenLayer_para(rng, hidden_size, hidden_size)
    HL_layer_params = [HL_layer_W3, HL_layer_b3]  # [HL_layer_W1, HL_layer_b1, HL_layer_W2, HL_layer_b2, HL_layer_W3, HL_layer_b3]
    # HL_layer_1 = HiddenLayer(rng, input=masked_input_batch, n_in=il5_emb_size, n_out=emb_size, W=HL_layer_W1, b=HL_layer_b1, activation=T.nnet.relu)
    # HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=emb_size, n_out=emb_size, W=HL_layer_W2, b=HL_layer_b2, activation=T.nnet.relu)
    HL_layer_3 = HiddenLayer(rng, input=masked_input_batch, n_in=emb_size, n_out=emb_size,
                             W=HL_layer_W3, b=HL_layer_b3, activation=T.tanh)
    batch_raw_pred = HL_layer_3.output
    # batch_pred = batch_raw_pred / T.sqrt(1e-8 + T.sum(batch_raw_pred**2, axis=1)).dimshuffle(0, 'x')
    batch_cosine = T.mean(cosine_row_wise_twoMatrix(batch_raw_pred, target_embs))  # T.mean(T.sum(batch_pred * target_embs, axis=1))
    batch_distance = T.mean(T.sqrt(1e-8 + T.sum((batch_raw_pred - target_embs) ** 2, axis=1)))
    cos_loss = -T.log(1.0 + batch_cosine)  # 1.0 - batch_cosine
    loss = -T.log(1.0 / (1.0 + batch_distance))

    params = HL_layer_params  # put all model parameters together
    cost = cos_loss + loss  # +1e-4*((HL_layer_W3**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    '''
    testing
    '''
    # train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([batch_ids, batch_masks, batch_targets], loss, updates=updates,
                                  allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([batch_ids, batch_masks, batch_targets], batch_cosine,
                                 allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches = dev_size / batch_size
    # dev_batch_start = list(np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = dev_size / batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [dev_size - batch_size]

    train_indices = range(train_size)
    cost_i = 0.0
    max_cosine = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter means how many batches have been run, taking the epoch loop into account
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_target[train_id_batch])

            # after every 20 batches, we evaluate the model on the dev split
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                test_loss = 0.0
                for test_batch_id in test_batch_start:  # for each dev batch
                    test_loss_i = test_model(dev_sents[test_batch_id:test_batch_id + batch_size],
                                             dev_masks[test_batch_id:test_batch_id + batch_size],
                                             dev_target[test_batch_id:test_batch_id + batch_size])
                    test_loss += test_loss_i
                test_loss /= len(test_batch_start)
                if test_loss > max_cosine:
                    max_cosine = test_loss
                print '\t\t\t\t\t\t\t\tcurrent mean_cosine:', test_loss, ' max cosine: ', max_cosine
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
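

# ---------------------------------------------------------------------------
# Illustration only: a NumPy sketch of the two losses combined in `cost` above.
# cos_loss = -log(1 + mean cosine) falls as the mapped IL5 phrase vectors align
# with their English target vectors; loss = -log(1 / (1 + mean distance)) is
# identical to log(1 + distance), so it falls as the Euclidean distance
# shrinks. Random vectors stand in for HL_layer_3 outputs and gold targets.
# ---------------------------------------------------------------------------
def _demo_mapping_losses():
    import numpy as np
    rng_demo = np.random.RandomState(0)
    pred = rng_demo.normal(size=(4, 300))       # mapped IL5 phrase embeddings (made up)
    target = rng_demo.normal(size=(4, 300))     # gold English word vectors (made up)
    cos = np.mean(np.sum(pred * target, axis=1) /
                  (np.linalg.norm(pred, axis=1) * np.linalg.norm(target, axis=1)))
    dist = np.mean(np.sqrt(1e-8 + np.sum((pred - target) ** 2, axis=1)))
    cos_loss = -np.log(1.0 + cos)               # analogue of cos_loss above
    dist_loss = -np.log(1.0 / (1.0 + dist))     # analogue of loss above, == log(1 + dist)
    return cos_loss + dist_loss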
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50, 50], batch_size=1,
                    window_width=3, maxSentLength=64, maxDocLength=60, emb_size=50,
                    hidden_size=200, L2_weight=0.0065, update_freq=1, norm_threshold=5.0,
                    max_s_length=57, max_d_length=59, margin=1.0, decay=0.95):

    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options

    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DQAAAA(
        rootPath + 'vocab_DQAAAA.txt',
        rootPath + 'mc500.train.tsv_standardlized.txt_DQAAAA.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_DQAAAA.txt',
        max_s_length, maxSentLength, maxDocLength)  # vocab_size contains train, dev and test

    [train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4,
     train_Label,
     train_Length_D, train_Length_D_s, train_Length_Q,
     train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
     train_leftPad_D, train_leftPad_D_s, train_leftPad_Q,
     train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4,
     train_rightPad_D, train_rightPad_D_s, train_rightPad_Q,
     train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4] = train_data
    [test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4,
     test_Label,
     test_Length_D, test_Length_D_s, test_Length_Q,
     test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4,
     test_leftPad_D, test_leftPad_D_s, test_leftPad_Q,
     test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
     test_rightPad_D, test_rightPad_D_s, test_rightPad_Q,
     test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    # indices_train_l = theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    # indices_train_r = theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    # indices_test_l = theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    # indices_test_r = theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    # indices_train_l = T.cast(indices_train_l, 'int64')
    # indices_train_r = T.cast(indices_train_r, 'int64')
    # indices_test_l = T.cast(indices_test_l, 'int64')
    # indices_test_r = T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size), theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size), dtype=theano.config.floatX)
    # rand_values[0] = numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_DQAAAA_glove_50d.txt')
    # rand_values = load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    # cost_tmp = 0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    # y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    # x = embeddings[x_index.flatten()].reshape(((batch_size*4), maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    # poolsize1 = (1, ishape[1]-filter_size[1]+1)
    # length_after_wideConv = ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape((maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input')  # .dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_Q_input')  # .dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_A1_input')  # .dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()  # .dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()  # .dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape((maxSentLength, emb_size)).transpose()  # .dimshuffle(0, 'x', 1, 2)

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]
    # conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    # layer2_para = [conv2_W, conv2_b]
    # high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    # highW_para = [high_W, high_b]
    # load_model(params)

    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U, W=W, b=b)
    layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:, left_Q:-right_Q], word_dim=emb_size,
                                hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1], word_dim=emb_size,
                                 hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2], word_dim=emb_size,
                                 hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3], word_dim=emb_size,
                                 hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1)
    layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:, left_A4:-right_A4], word_dim=emb_size,
                                 hidden_dim=nkerns[0], U=U, W=W, b=b, bptt_truncate=-1)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output = debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output_vector_mean, 'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output_vector_mean, 'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output_vector_mean, 'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output')

    # before reasoning, run a GRU over the doc: d
    U_d, W_d, b_d = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d_para = [U_d, W_d, b_d]
    layer_D_GRU = GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0], hidden_dim=nkerns[0],
                                   U=U_d, W=W_d, b=b_d, bptt_truncate=-1)

    # Reasoning Layer 1
    repeat_Q = debug_print(T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0], 1)), maxDocLength, axis=1)[:, :layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN = debug_print(T.concatenate([layer_D_GRU.output_matrix, repeat_Q], axis=0).transpose(), 'input_DNN')  # each row is an example
    output_DNN1 = HiddenLayer(rng, input=input_DNN, n_in=nkerns[0] * 2, n_out=nkerns[0])
    output_DNN2 = HiddenLayer(rng, input=output_DNN1.output, n_in=nkerns[0], n_out=nkerns[0])
    DNN_out = debug_print(output_DNN2.output.transpose(), 'DNN_out')
    U_p, W_p, b_p = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para = [U_p, W_p, b_p]
    pooling = GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],
                               U=U_p, W=W_p, b=b_p, bptt_truncate=-1)
    translated_Q1 = debug_print(pooling.output_vector_max, 'translated_Q1')

    # before reasoning, run a GRU over the doc: d2
    U_d2, W_d2, b_d2 = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d2_para = [U_d2, W_d2, b_d2]
    layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0],
                                    hidden_dim=nkerns[0], U=U_d2, W=W_d2, b=b_d2, bptt_truncate=-1)

    # Reasoning Layer 2
    repeat_Q1 = debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0], 1)), maxDocLength, axis=1)[:, :layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1')
    input_DNN2 = debug_print(T.concatenate([layer_D2_GRU.output_matrix, repeat_Q1], axis=0).transpose(), 'input_DNN2')  # each row is an example
    output_DNN3 = HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0] * 2, n_out=nkerns[0])
    output_DNN4 = HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0])
    DNN_out2 = debug_print(output_DNN4.output.transpose(), 'DNN_out2')
    U_p2, W_p2, b_p2 = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para2 = [U_p2, W_p2, b_p2]
    pooling2 = GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],
                                U=U_p2, W=W_p2, b=b_p2, bptt_truncate=-1)
    translated_Q2 = debug_print(pooling2.output_vector_max, 'translated_Q2')

    QA1 = T.concatenate([translated_Q2, layer0_A1_output], axis=0)
    QA2 = T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3 = T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4 = T.concatenate([translated_Q2, layer0_A4_output], axis=0)

    W_HL, b_HL = create_HiddenLayer_para(rng, n_in=nkerns[0] * 2, n_out=1)
    match_params = [W_HL, b_HL]
    QA1_match = HiddenLayer(rng, input=QA1, n_in=nkerns[0] * 2, n_out=1, W=W_HL, b=b_HL)
    QA2_match = HiddenLayer(rng, input=QA2, n_in=nkerns[0] * 2, n_out=1, W=W_HL, b=b_HL)
    QA3_match = HiddenLayer(rng, input=QA3, n_in=nkerns[0] * 2, n_out=1, W=W_HL, b=b_HL)
    QA4_match = HiddenLayer(rng, input=QA4, n_in=nkerns[0] * 2, n_out=1, W=W_HL, b=b_HL)

    # simi_overall_level1 = debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1')
    # simi_overall_level2 = debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2')
    # simi_overall_level3 = debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3')
    # simi_overall_level4 = debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4')
    simi_overall_level1 = debug_print(QA1_match.output[0], 'simi_overall_level1')
    simi_overall_level2 = debug_print(QA2_match.output[0], 'simi_overall_level2')
    simi_overall_level3 = debug_print(QA3_match.output[0], 'simi_overall_level3')
    simi_overall_level4 = debug_print(QA4_match.output[0], 'simi_overall_level4')

    # eucli_1 = 1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    # only use overall_simi; ranking loss per negative answer: max(0, margin - posi + nega)
    cost = T.maximum(0.0, margin + simi_overall_level2 - simi_overall_level1) + \
           T.maximum(0.0, margin + simi_overall_level3 - simi_overall_level1) + \
           T.maximum(0.0, margin + simi_overall_level4 - simi_overall_level1)
    # cost = T.maximum(0.0, margin + T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4]) - simi_overall_level1)
    posi_simi = simi_overall_level1
    nega_simi = T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    # # use ensembled simi
    # cost = T.maximum(0.0, margin + T.max([simi_2, simi_3, simi_4]) - simi_1)  # ranking loss
    # posi_simi = simi_1
    # nega_simi = T.max([simi_2, simi_3, simi_4])

    L2_reg = debug_print((U ** 2).sum() + (W ** 2).sum()
                         + (U_p ** 2).sum() + (W_p ** 2).sum()
                         + (U_p2 ** 2).sum() + (W_p2 ** 2).sum()
                         + (U_d ** 2).sum() + (W_d ** 2).sum()
                         + (U_d2 ** 2).sum() + (W_d2 ** 2).sum()
                         + (output_DNN1.W ** 2).sum() + (output_DNN2.W ** 2).sum()
                         + (output_DNN3.W ** 2).sum() + (output_DNN4.W ** 2).sum()
                         + (W_HL ** 2).sum(), 'L2_reg')  # optionally also + (embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    # cost = debug_print((cost_this + cost_tmp) / update_freq, 'cost')

    test_model = theano.function(
        [index], [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  # a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        }, on_unused_input='ignore')

    params = layer0_para + output_DNN1.params + output_DNN2.params + output_DNN3.params + \
             output_DNN4.params + layer_pooling_para + layer_pooling_para2 + match_params + \
             layer_d_para + layer_d2_para

    # accumulator = []
    # for para_i in params:
    #     eps_p = numpy.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
    #     accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # updates = []
    # for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #     grad_i = debug_print(grad_i, 'grad_i')
    #     acc = decay*acc_i + (1-decay)*T.sqr(grad_i)  # rmsprop
    #     updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6)))
    #     updates.append((acc_i, acc))

    def AdaDelta_updates(parameters, gradients, rho, eps):
        # create shared variables to store the intermediate running averages
        gradients_sq = [theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters]
        deltas_sq = [theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters]
        # calculate the new "average" of squared gradients for the next iteration
        gradients_sq_new = [rho * g_sq + (1 - rho) * (g ** 2)
                            for g_sq, g in zip(gradients_sq, gradients)]
        # calculate the step in each direction; the square root approximates the RMS of the average value
        deltas = [(T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad
                  for d_sq, g_sq, grad in zip(deltas_sq, gradients_sq_new, gradients)]
        # calculate the new "average" of squared deltas for the next step
        deltas_sq_new = [rho * d_sq + (1 - rho) * (d ** 2) for d_sq, d in zip(deltas_sq, deltas)]
        # prepare the updates as a list of (shared variable, new value) pairs
        gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
        parameters_updates = [(p, p - d) for p, d in zip(parameters, deltas)]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates

    updates = AdaDelta_updates(params, grads, decay, 1e-6)

    train_model = theano.function(
        [index], [cost, posi_simi, nega_simi], updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        }, on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        }, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time
    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        # for minibatch_index in xrange(n_train_batches):  # each batch
        minibatch_index = 0
        # shuffle(train_batch_start)  # shuffle training data
        corr_train = 0
        for batch_start in train_batch_start:
            # iter means how many batches have been run, taking the epoch loop into account
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" % ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1
            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' average cost: ' + str(cost_average) + ' corr rate: ' + str(corr_train * 100.0 / train_size)
            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1
                # write_file.close()
                # test_score = numpy.mean(test_losses)
                test_acc = corr_test * 1.0 / test_size
                # test_acc = 1 - test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best model %f %%') %
                      (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                # now, see the results of LR
                # write_feature = open(rootPath+'feature_check.txt', 'w')
                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        # writefile.close()
        # print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
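

# ---------------------------------------------------------------------------
# Illustration only: one AdaDelta step in plain NumPy, mirroring the symbolic
# AdaDelta_updates above (rho = decay, eps = 1e-6). It keeps running averages
# of squared gradients and squared deltas and scales each step by the ratio of
# their RMS values, so no global learning rate is needed. The gradient is made
# up for the demo.
# ---------------------------------------------------------------------------
def _demo_adadelta_step():
    import numpy as np
    rho, eps = 0.95, 1e-6
    param = np.zeros(3)
    grad = np.array([0.1, -0.2, 0.3])            # made-up gradient for one parameter vector
    g_sq = np.zeros(3)                           # running average of grad**2
    d_sq = np.zeros(3)                           # running average of delta**2
    g_sq = rho * g_sq + (1 - rho) * grad ** 2
    delta = (np.sqrt(d_sq + eps) / np.sqrt(g_sq + eps)) * grad
    d_sq = rho * d_sq + (1 - rho) * delta ** 2
    param = param - delta                        # apply the step
    return param, g_sq, d_sq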
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=70, filter_size=[3, 1], maxSentLen=70, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) "load raw data" all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset( maxlen=maxSentLen ) #minlen, include one label, at least one word in the sentence train_sents_l = np.asarray(all_sentences_l[0], dtype='int32') dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32') test_sents_l = np.asarray(all_sentences_l[2], dtype='int32') train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX) dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX) test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX) train_sents_r = np.asarray(all_sentences_r[0], dtype='int32') dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32') test_sents_r = np.asarray(all_sentences_r[2], dtype='int32') train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX) dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX) test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX) train_labels_store = np.asarray(all_labels[0], dtype='int32') dev_labels_store = np.asarray(all_labels[1], dtype='int32') test_labels_store = np.asarray(all_labels[2], dtype='int32') train_size = len(train_labels_store) dev_size = len(dev_labels_store) test_size = len(test_labels_store) print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size vocab_size = len(word2id) + 1 "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered" "words keep random initialization" rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_word2vec() rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) init_embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable "now, start to build the input form of the model" sents_ids_l = T.imatrix() sents_mask_l = T.fmatrix() sents_ids_r = T.imatrix() sents_mask_r = T.fmatrix() labels = T.ivector() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' 'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in' 'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) ' embed_input_l = init_embeddings[sents_ids_l.flatten( )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1 ) #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1) '''create parameters for attentive convolution function ''' gate_filter_shape = (emb_size, 1, emb_size, 1) conv_W_pre, conv_b_pre = create_conv_para(rng, filter_shape=gate_filter_shape) conv_W_gate, conv_b_gate = create_conv_para(rng, filter_shape=gate_filter_shape) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W2_context, conv_b2_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) NN_para = [ conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate, conv_b_gate, conv_W2, conv_b2, conv_W2_context ] "A gated convolution layer to form more expressive word representations in each sentence" "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)" conv_layer_gate_l = Conv_with_Mask_with_Gate( rng, input_tensor3=embed_input_l, mask_matrix=sents_mask_l, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate) conv_layer_gate_r = Conv_with_Mask_with_Gate( rng, input_tensor3=embed_input_r, mask_matrix=sents_mask_r, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=gate_filter_shape, W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate) ''' attentive convolution function, two sizes of filter_width 3&1 are used. 
    # attentive convolution; two filter widths, 3 and 1, are used (multi-channel)
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r

    # batch normalization for the four output sentence representation vectors
    gamma = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[0]),
        high=1.0 / math.sqrt(hidden_size[0]),
        size=(hidden_size[0])),
                                     dtype=theano.config.floatX),
                          borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]), dtype=theano.config.floatX),
                         borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0, ), keepdims=True),
        mode='low_mem')
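    # batch_normalization standardizes each feature over the minibatch and then
    # rescales it; per feature j it computes
    #
    #   bn_x[:, j] = gamma[j] * (x[:, j] - mean_j) / std_j + beta[j]
    #
    # note that one shared (gamma, beta) pair is reused for all four vectors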
    # before the logistic regression layer we insert a hidden layer; form its input
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ],
                                     axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]

    # create hidden layer parameters
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]

    # hidden layer, with batch normalization applied to its output again
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[1]),
        high=1.0 / math.sqrt(hidden_size[1]),
        size=(hidden_size[1])),
                                        dtype=theano.config.floatX),
                             borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]),
                                     dtype=theano.config.floatX),
                            borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(
        inputs=HL_layer_1.output,
        gamma=gamma_HL,
        beta=beta_HL,
        mean=HL_layer_1.output.mean((0, ), keepdims=True),
        std=HL_layer_1.output.std((0, ), keepdims=True),
        mode='low_mem')

    # form the input to the LR classifier
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]

    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (LR_input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]

    # logistic regression layer: essentially a multiplication between the
    # weight matrix and the input feature vector
    layer_LR = LogisticRegression(rng,
                                  input=normalize_matrix_col_wise(LR_input),
                                  n_in=LR_input_size,
                                  n_out=3,
                                  W=U_a,
                                  b=LR_b)
    # for classification we use negative log-likelihood as the loss: the lower the better
    loss = layer_LR.negative_log_likelihood(labels)

    params = [init_embeddings] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    # use AdaGrad to update parameters
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
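    # Shape contract for the compiled functions (with the defaults above,
    # batch_size=70 and maxSentLen=70), e.g.
    #
    #   cost_i = train_model(sents_l,   # int32   (70, 70) word ids
    #                        masks_l,   # float32 (70, 70), 1=real token, 0=pad
    #                        sents_r, masks_r,
    #                        batch_labels)  # int32 (70,)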
    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        # shuffle the training set at each new epoch; this usually helps, but is not guaranteed to
        random.Random(100).shuffle(train_indices)
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter counts how many batches have been run over all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2 and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each dev batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id + batch_size])
                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / len(dev_batch_start)

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    # best dev model so far, so evaluate it on test
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id + batch_size],
                            test_masks_l[test_batch_id:test_batch_id + batch_size],
                            test_sents_r[test_batch_id:test_batch_id + batch_size],
                            test_masks_r[test_batch_id:test_batch_id + batch_size],
                            test_labels_store[test_batch_id:test_batch_id + batch_size])
                        error_sum += error_i
                    test_acc = 1.0 - error_sum / len(test_batch_start)
                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

    end_time = time.time()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_acc_test
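# The batch-start lists above append `size - batch_size` so the tail of the
# data is still covered when size % batch_size != 0: the last window simply
# overlaps the previous one. A standalone sketch (hypothetical sizes):
def _batch_starts_sketch(size=175, batch_size=70):
    import numpy as np
    starts = list(np.arange(size / batch_size) * batch_size) + [size - batch_size]
    # -> [0, 70, 105]: windows [0:70], [70:140], [105:175]; examples 105..139
    # are scored twice, which slightly biases the reported accuracy
    return starts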
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=1.0,
                    decay=0.95):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options

    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DQAAAA(
        rootPath + 'vocab_DQAAAA.txt',
        rootPath + 'mc500.train.tsv_standardlized.txt_DQAAAA.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,
        maxSentLength, maxDocLength)  # vocab_size covers train, dev and test

    [
        train_data_D, train_data_Q, train_data_A1, train_data_A2,
        train_data_A3, train_data_A4, train_Label, train_Length_D,
        train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2,
        train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
        train_leftPad_A4, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_Q, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3, train_rightPad_A4
    ] = train_data
    [
        test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3,
        test_data_A4, test_Label, test_Length_D, test_Length_D_s,
        test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3,
        test_Length_A4, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q,
        test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1,
        test_rightPad_A2, test_rightPad_A3, test_rightPad_A4
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size
    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    # rand_values[0] = numpy.array([1e-50] * emb_size)
    rand_values = load_word2vec_to_init(
        rand_values, rootPath + 'vocab_DQAAAA_glove_50d.txt')
    # rand_values = load_word2vec_to_init(rand_values, rootPath + 'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    # cost_tmp = 0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # the doc is an index matrix; entries must be integers
    index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    # y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    # poolsize1 = (1, ishape[1] - filter_size[1] + 1)
    # length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    layer0_D_input = debug_print(
        embeddings[index_D.flatten()].reshape(
            (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1),
        'layer0_D_input')
    layer0_Q_input = debug_print(
        embeddings[index_Q.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_Q_input')
    layer0_A1_input = debug_print(
        embeddings[index_A1.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_A1_input')
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_A4_input = embeddings[index_A4.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()

    U, W, b, Ub, Wb, bb = create_Bi_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b, Ub, Wb, bb]
    # conv2_W, conv2_b = create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    # layer2_para = [conv2_W, conv2_b]
    # high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    # highW_para = [high_W, high_b]
    # load_model(params)

    layer0_D = Bi_GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :],
                                    lefts=left_D_s[left_D:-right_D],
                                    rights=right_D_s[left_D:-right_D],
                                    hidden_dim=nkerns[0],
                                    U=U, W=W, b=b, Ub=Ub, Wb=Wb, bb=bb)
    layer0_Q = Bi_GRU_Matrix_Input(X=layer0_Q_input[:, left_Q:-right_Q],
                                   word_dim=emb_size,
                                   hidden_dim=nkerns[0],
                                   U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb,
                                   bptt_truncate=-1)
    layer0_A1 = Bi_GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb,
                                    bptt_truncate=-1)
    layer0_A2 = Bi_GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb,
                                    bptt_truncate=-1)
    layer0_A3 = Bi_GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb,
                                    bptt_truncate=-1)
    layer0_A4 = Bi_GRU_Matrix_Input(X=layer0_A4_input[:, left_A4:-right_A4],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U, W=W, b=b, U_b=Ub, W_b=Wb, b_b=bb,
                                    bptt_truncate=-1)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')  # hidden*2
    layer0_Q_output = debug_print(layer0_Q.output_vector_last, 'layer0_Q.output')  # hidden*4
    layer0_A1_output = debug_print(layer0_A1.output_vector_last, 'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output_vector_last, 'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output_vector_last, 'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output_vector_last, 'layer0_A4.output')
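    # Each example is stored with explicit left/right padding; slicing
    # X[:, left:-right] recovers the real tokens before the GRU runs, e.g.
    #
    #   X ~ embeddings of '<pad> <pad> w1 w2 w3 <pad>'   # (emb_size, 6)
    #   X[:, 2:-1]                                       # (emb_size, 3) -> w1 w2 w3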
    # before reasoning, run another Bi-GRU over the doc's sentence representations
    U_d, W_d, b_d, U_db, W_db, b_db = create_Bi_GRU_para(
        rng, nkerns[0] * 2, nkerns[0] * 2)
    layer_d_para = [U_d, W_d, b_d, U_db, W_db, b_db]
    layer_D_GRU = Bi_GRU_Matrix_Input(X=layer0_D_output,
                                      word_dim=nkerns[0] * 2,
                                      hidden_dim=nkerns[0] * 2,
                                      U=U_d, W=W_d, b=b_d,
                                      U_b=U_db, W_b=W_db, b_b=b_db,
                                      bptt_truncate=-1)

    # Reasoning Layer 1
    repeat_Q = debug_print(
        T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0], 1)),
                 maxDocLength,
                 axis=1)[:, :layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN = debug_print(
        T.concatenate([layer_D_GRU.output_matrix, repeat_Q],
                      axis=0).transpose(), 'input_DNN')  # each row is an example
    output_DNN1 = HiddenLayer(rng,
                              input=input_DNN,
                              n_in=nkerns[0] * 8,
                              n_out=nkerns[0])
    attention_W = create_ensemble_para(rng, nkerns[0], 1)
    attention_weights = T.nnet.softmax(
        T.dot(attention_W, output_DNN1.output.transpose()))
    repeat_attentions = T.repeat(attention_weights,
                                 layer_D_GRU.output_matrix.shape[0],
                                 axis=0)
    doc_r = T.sum(layer_D_GRU.output_matrix * repeat_attentions, axis=1)

    combine_DQ = T.concatenate([doc_r, layer0_Q_output], axis=0)  # dim: hidden*6
    output_DNN2 = HiddenLayer(rng,
                              input=combine_DQ,
                              n_in=nkerns[0] * 8,
                              n_out=nkerns[0] * 4)
    # a second reasoning layer (GRU pooling plus another DNN) was tried here and
    # left commented out in the source
    translated_Q2 = debug_print(output_DNN2.output, 'translated_Q2')
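    # doc_r above is attention-weighted pooling over the per-position Bi-GRU
    # states H of shape (hidden_dim, doc_len); in numpy terms, roughly:
    #
    #   scores = np.dot(attention_W, hidden_feats.T)   # (1, doc_len)
    #   alpha  = softmax(scores)                       # weights, sum to 1
    #   doc_r  = (H * alpha).sum(axis=1)               # (hidden_dim,)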
    QA1 = T.concatenate([translated_Q2, layer0_A1_output], axis=0)  # dim: hidden*5
    QA2 = T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3 = T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4 = T.concatenate([translated_Q2, layer0_A4_output], axis=0)

    W_HL, b_HL = create_HiddenLayer_para(rng, n_in=nkerns[0] * 8, n_out=1)
    match_params = [W_HL, b_HL]
    QA1_match = HiddenLayer(rng, input=QA1, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL)
    QA2_match = HiddenLayer(rng, input=QA2, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL)
    QA3_match = HiddenLayer(rng, input=QA3, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL)
    QA4_match = HiddenLayer(rng, input=QA4, n_in=nkerns[0] * 8, n_out=1, W=W_HL, b=b_HL)

    # cosine(translated_Q2, layer0_A*_output) was used for these similarities in
    # an earlier version and left commented out in the source
    simi_overall_level1 = debug_print(QA1_match.output[0], 'simi_overall_level1')
    simi_overall_level2 = debug_print(QA2_match.output[0], 'simi_overall_level2')
    simi_overall_level3 = debug_print(QA3_match.output[0], 'simi_overall_level3')
    simi_overall_level4 = debug_print(QA4_match.output[0], 'simi_overall_level4')

    # only the overall similarity is used; ranking loss: max(0, margin + nega - posi)
    cost = T.maximum(
        0.0, margin + simi_overall_level2 - simi_overall_level1) + T.maximum(
            0.0, margin + simi_overall_level3 - simi_overall_level1) + T.maximum(
                0.0, margin + simi_overall_level4 - simi_overall_level1)
    posi_simi = simi_overall_level1
    nega_simi = T.max(
        [simi_overall_level2, simi_overall_level3, simi_overall_level4])

    L2_reg = debug_print(
        (U ** 2).sum() + (W ** 2).sum() + (Ub ** 2).sum() + (Wb ** 2).sum() +
        (output_DNN1.W ** 2).sum() + (output_DNN2.W ** 2).sum() +
        (U_d ** 2).sum() + (W_d ** 2).sum() + (U_db ** 2).sum() +
        (W_db ** 2).sum() + (W_HL ** 2).sum() + (attention_W ** 2).sum(),
        'L2_reg')  # optionally also + (embeddings ** 2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    # cost = debug_print((cost_this + cost_tmp) / update_freq, 'cost')
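    # the cost above is a margin ranking loss: the correct answer's score must
    # beat every distractor's score by at least `margin`; scalar sketch:
    #
    #   def hinge(margin, posi, nega):
    #       return max(0.0, margin + nega - posi)
    #   # cost = hinge(m, s1, s2) + hinge(m, s1, s3) + hinge(m, s1, s4)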
    test_model = theano.function(
        [index], [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  # a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    params = layer0_para + output_DNN1.params + output_DNN2.params + match_params + layer_d_para + [attention_W]

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    # an rmsprop-style accumulator version of the updates was tried here and
    # left commented out in the source

    def AdaDelta_updates(parameters, gradients, rho, eps):
        # create variables to store intermediate updates
        gradients_sq = [
            theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters
        ]
        deltas_sq = [
            theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters
        ]
        # calculate the new running average of squared gradients
        gradients_sq_new = [
            rho * g_sq + (1 - rho) * (g ** 2)
            for g_sq, g in zip(gradients_sq, gradients)
        ]
        # calculate the step in each direction; the square root approximates the
        # RMS of the running averages
        deltas = [
            (T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad
            for d_sq, g_sq, grad in zip(deltas_sq, gradients_sq_new, gradients)
        ]
        # calculate the new running average of squared deltas for the next step
        deltas_sq_new = [
            rho * d_sq + (1 - rho) * (d ** 2)
            for d_sq, d in zip(deltas_sq, deltas)
        ]
        # prepare the updates as a list of (variable, new value) pairs
        gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
        parameters_updates = [(p, p - d) for p, d in zip(parameters, deltas)]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates

    updates = AdaDelta_updates(params, grads, decay, 1e-6)
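    # one scalar AdaDelta step, mirroring the updates above (rho=decay, eps=1e-6):
    #
    #   g_sq  = rho * g_sq + (1 - rho) * g ** 2            # running avg of grad^2
    #   delta = sqrt(d_sq + eps) / sqrt(g_sq + eps) * g    # RMS-scaled step
    #   d_sq  = rho * d_sq + (1 - rho) * delta ** 2        # running avg of step^2
    #   p     = p - delta                                  # no global learning rate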
    train_model = theano.function(
        [index], [cost, posi_simi, nega_simi],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time
    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_index = 0
        # shuffle(train_batch_start)  # optionally shuffle the training data
        corr_train = 0
        for batch_start in train_batch_start:
            # iter counts how many batches have been run over all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1
            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(iter) + ' average cost: ' + str(
                    cost_average) + ' corr rate: ' + str(corr_train * 100.0 / train_size)

            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1
                test_acc = corr_test * 1.0 / test_size
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                       'model %f %%') %
                      (epoch, minibatch_index, n_train_batches, test_acc * 100.))

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    L2_weight=0.000001,
                    drop_p=0.05,
                    emb_size=300,
                    hidden_size=500,
                    HL_hidden_size=500,
                    batch_size=5,
                    filter_size=[3, 5, 7],
                    maxSentLen=180,
                    comment=''):
    model_options = locals().copy()
    print "model options", model_options

    rng = np.random.RandomState(1234)  # fixed random seed so the model reproduces the same results
    srng = RandomStreams(rng.randint(999999))

    all_sentences, all_masks, all_labels, word2id = load_yelp_dataset(
        maxlen=maxSentLen,
        minlen=2)  # minlen: each example keeps one label and at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero-pad index

    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # matrix drawn from a Gaussian distribution
    # here we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec_file('glove.840B.300d.txt')
    # rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX),
        borrow=True)  # wrap the python variable "rand_values" into a theano variable

    # now, build the symbolic inputs of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.ivector('labels')
    train_flag = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)  # this input format suits CNN, GRU or LSTM
    # drop_common_input = dropout_layer(srng, common_input, drop_p, train_flag)

    bow = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)  # (batch, emb_size)

    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_2_pre, conv_b_2_pre = create_conv_para(
        rng, filter_shape=gate_filter_shape)
    conv_W_2_gate, conv_b_2_gate = create_conv_para(
        rng, filter_shape=gate_filter_shape)
    conv_W, conv_b = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))
    conv_W2, conv_b2 = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))
    # a third filter width (filter_size[2]) and its soft-attention parameters
    # were tried and left commented out in the source

    soft_att_W_big, soft_att_b_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W_small, _ = create_HiddenLayer_para(rng, emb_size, 1)
    soft_att_W2_big, soft_att_b2_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W2_small, _ = create_HiddenLayer_para(rng, emb_size, 1)

    NN_para = [
        conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate, conv_W,
        conv_b, conv_W_context, conv_W2, conv_b2, conv_W2_context,
        soft_att_W_big, soft_att_b_big, soft_att_W_small, soft_att_W2_big,
        soft_att_b2_big, soft_att_W2_small
    ]

    # gated convolution layer to form more expressive word representations
    conv_layer_1_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_2_pre,
        b=conv_b_2_pre,
        W_gate=conv_W_2_gate,
        b_gate=conv_b_2_gate)
    advanced_sent_tensor3 = conv_layer_1_gate_l.output_tensor3

    # a plain Conv_for_Pair variant (without soft attention) was tried for both
    # filter widths and left commented out in the source
    conv_layer_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context,
        soft_att_W_big=soft_att_W_big,
        soft_att_b_big=soft_att_b_big,
        soft_att_W_small=soft_att_W_small)
    conv_layer_2_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context,
        soft_att_W_big=soft_att_W2_big,
        soft_att_b_big=soft_att_b2_big,
        soft_att_W_small=soft_att_W2_small)

    sent_embeddings = conv_layer_pair.maxpool_vec_l
    att_sent_embeddings = conv_layer_pair.attentive_maxpool_vec_l
    sent_embeddings_2 = conv_layer_2_pair.maxpool_vec_l
    att_sent_embeddings_2 = conv_layer_2_pair.attentive_maxpool_vec_l

    # classification layers: map the feature vector to the five sentiment classes
    HL_input = T.concatenate([
        bow, sent_embeddings, att_sent_embeddings, sent_embeddings_2,
        att_sent_embeddings_2
    ],
                             axis=1)
    HL_input_size = hidden_size * 4 + emb_size

    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_input_size, HL_hidden_size)
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_input,
                             n_in=HL_input_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.nnet.relu)
    HL_layer_2_W, HL_layer_2_b = create_HiddenLayer_para(
        rng, HL_hidden_size, HL_hidden_size)
    HL_layer_2_params = [HL_layer_2_W, HL_layer_2_b]
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=HL_hidden_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_2_W,
                             b=HL_layer_2_b,
                             activation=T.nnet.relu)
    # dropout on the hidden-layer outputs and on LR_input was tried and left
    # commented out in the source

    LR_input = T.concatenate([HL_input, HL_layer_1.output, HL_layer_2.output],
                             axis=1)
    LR_input_size = HL_input_size + 2 * HL_hidden_size

    U_a = create_ensemble_para(rng, 5, LR_input_size)  # (LR_input_size, 5)
    LR_b = theano.shared(value=np.zeros((5, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]

    # logistic regression layer: essentially a multiplication between the
    # weight matrix and the input feature vector
    layer_LR = LogisticRegression(rng,
                                  input=LR_input,
                                  n_in=LR_input_size,
                                  n_out=5,
                                  W=U_a,
                                  b=LR_b)
    # for classification we use negative log-likelihood as the loss: the lower the better
    loss = layer_LR.negative_log_likelihood(labels)

    # put all model parameters together
    params = [embeddings] + NN_para + HL_layer_1_params + HL_layer_2_params + LR_para
    L2_reg = L2norm_paraList([
        embeddings, conv_W_2_pre, conv_W_2_gate, conv_W, conv_W_context,
        conv_W2, conv_W2_context, soft_att_W_big, soft_att_W_small,
        soft_att_W2_big, soft_att_W2_small, HL_layer_1_W, HL_layer_2_W, U_a
    ])
    cost = loss  # + L2_weight * L2_reg

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # AdaGrad updates: accumulate squared gradients per parameter
    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True),
                              dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  # 1e-8 is added to avoid division by zero
        updates.append((acc_i, acc))
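    # one scalar AdaGrad step, mirroring the loop above: the accumulated squared
    # gradients shrink each parameter's effective step size over time:
    #
    #   acc += g ** 2
    #   p   -= learning_rate * g / (np.sqrt(acc) + 1e-8)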
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        [layer_LR.errors(labels), layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        # shuffle the training set at each new epoch; this usually helps, but is not guaranteed to
        random.Random(200).shuffle(train_indices)
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter counts how many batches have been run over all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], 1)

            # every 2000 batches, test the performance of the model on all test data
            if iter % 2000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i, pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size],
                        0)
                    # (writing pred_labels to a prediction log was tried and
                    # left commented out in the source)
                    error_sum += error_i
                test_accuracy = 1.0 - error_sum / len(test_batch_start)
                if test_accuracy > max_acc_test:
                    max_acc_test = test_accuracy
                print '\t\tcurrent test_acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

    end_time = time.time()
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_acc_test
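# A guess at how such a script is launched (the original repo may drive it
# differently); evaluate_lenet5 returns the best test accuracy observed:
if __name__ == '__main__':
    best_test_acc = evaluate_lenet5()
    print 'max test accuracy:', best_test_acc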