def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=0.0000001, extra_size=4, emb_size=300,
                    batch_size=50, filter_size=[3, 5], maxSentLen=60, hidden_size=[300, 300]):
    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fix the random seed so the model produces reproducible results

    # load SNLI for training/dev; load NYT for testing (transfer evaluation)
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen)  # each example has one label and at least one word per sentence
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_NYT_dataset(
        maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian
    # row 0 is reserved for padding; pre-trained word2vec vectors then overwrite the rows of covered words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX),
                                    borrow=True)  # wrap the numpy array "rand_values" into a theano shared variable

    # now build the symbolic inputs of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(init_embeddings, sents_ids_l)  # (batch_size, emb_size, maxSentLen); usable by CNN, GRU or LSTM
    embed_input_r = embed_input(init_embeddings, sents_ids_r)

    # create attentive-convolution parameters
    conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
    conv_W_2, conv_b_2 = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]))
    conv_W_context_2, conv_b_context_2 = create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
    NN_para = [conv_W, conv_b, conv_W_context, conv_W_2, conv_b_2, conv_W_context_2]

    # attentive convolution, one layer per filter width
    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer_2 = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W_2, b=conv_b_2,
        W_context=conv_W_context_2, b_context=conv_b_context_2)
    attentive_sent_embeddings_l_2 = attentive_conv_layer_2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r_2 = attentive_conv_layer_2.attentive_maxpool_vec_r

    # form input to the hidden layers
    HL_layer_1_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r,
        attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2,
        attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2
    ], axis=1)
    HL_layer_1_input_size = 6 * hidden_size[1]

    HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[1],
                             activation=T.nnet.relu)
    HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[1], n_out=hidden_size[1],
                             activation=T.nnet.relu)

    # form input to the LR classifier
    LR_input = T.tanh(T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1))
    LR_input_size = HL_layer_1_input_size + 2 * hidden_size[1]

    U_a = create_ensemble_para(rng, 3, LR_input_size)  # weight matrix of shape (LR_input_size, 3)
    LR_b = theano.shared(value=np.zeros((3,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a,
                                  b=LR_b)  # essentially a weight-matrix multiplication of the input feature vector plus softmax
    loss = layer_LR.negative_log_likelihood(labels)  # negative log likelihood is the usual classification loss; lower is better

    # testing: map the 3-way SNLI prediction to a binary decision for the NYT test set
    test_preds = T.argmax(layer_LR.p_y_given_x, axis=1)
    transfered_preds = T.eq(test_preds, 2)
    test_error = T.mean(T.neq(transfered_preds, labels))

    params = [init_embeddings] + NN_para + HL_layer_1.params + HL_layer_2.params + LR_para
    cost = loss
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost,
                                  updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                 [test_error, transfered_preds], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0
    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)  # shuffle the training set for each new epoch; usually helps, but not guaranteed
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter counts how many batches have been run across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            # every 100 batches, evaluate the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                    time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                pred_labels = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(test_batch_start):  # for each test batch
                    error_i, pred_labels_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    pred_labels += list(pred_labels_i)
                    gold_labels += list(test_labels_store[test_batch_id:test_batch_id + batch_size])
                test_acc = 1.0 - error_sum / (len(test_batch_start))
                test_f1 = f1_score_2_binary_list(gold_labels, pred_labels)  # binary F1 over the transferred predictions
                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                if test_f1 > max_f1:
                    max_f1 = test_f1
                    # store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params)
                print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_acc_test
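# --- Illustration only (not part of the original script) -----------------------------------------
# A minimal sketch of the transfer evaluation performed by test_model above: the 3-way SNLI
# classifier is reduced to a binary decision by treating class 2 as the positive class, exactly as
# T.eq(test_preds, 2) does. Assumptions: class id 2 corresponds to "entailment" in the encoding
# produced by load_SNLI_dataset, and sklearn's f1_score is used here only as a stand-in for the
# repo's f1_score_2_binary_list helper.
def _binary_transfer_metrics_sketch(p_y_given_x, binary_gold):
    import numpy as np
    from sklearn.metrics import f1_score  # assumed equivalent of f1_score_2_binary_list
    preds_3way = np.argmax(p_y_given_x, axis=1)        # 3-way predictions per example
    preds_binary = (preds_3way == 2).astype('int32')   # "entailment" -> 1, everything else -> 0
    acc = float(np.mean(preds_binary == binary_gold))  # same quantity as 1.0 - test_error
    f1 = f1_score(binary_gold, preds_binary)
    return acc, f1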
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=70, filter_size=[3, 1],
                    maxSentLen=70, hidden_size=[300, 300]):
    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fix the random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    # load raw data
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen)  # each example has one label and at least one word per sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    # first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec
    # embeddings to initialize covered words; uncovered words keep their random initialization
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX),
                                    borrow=True)  # wrap the numpy array "rand_values" into a theano shared variable

    # now build the symbolic inputs of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # use the word ids to retrieve word embeddings from "init_embeddings": each sentence becomes a
    # matrix (emb_size, sen_length), so a minibatch is a tensor3 (batch_size, emb_size, sen_length)
    embed_input_l = init_embeddings[sents_ids_l.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)  # this input format suits CNN, GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    # create parameters for the attentive convolution function
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_pre, conv_b_pre = create_conv_para(rng, filter_shape=gate_filter_shape)
    conv_W_gate, conv_b_gate = create_conv_para(rng, filter_shape=gate_filter_shape)
    conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    NN_para = [conv_W, conv_b, conv_W_context,
               conv_W_pre, conv_b_pre, conv_W_gate, conv_b_gate,
               conv_W2, conv_b2, conv_W2_context]

    # a gated convolution layer to form more expressive word representations in each sentence;
    # input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)
    conv_layer_gate_l = Conv_with_Mask_with_Gate(rng, input_tensor3=embed_input_l, mask_matrix=sents_mask_l,
                                                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                                                 filter_shape=gate_filter_shape,
                                                 W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate)
    conv_layer_gate_r = Conv_with_Mask_with_Gate(rng, input_tensor3=embed_input_r, mask_matrix=sents_mask_r,
                                                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                                                 filter_shape=gate_filter_shape,
                                                 W=conv_W_pre, b=conv_b_pre, W_gate=conv_W_gate, b_gate=conv_b_gate)

    # attentive convolution; two filter widths (3 and 1) are used, i.e. multi-channel
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2, b=conv_b2,
        W_context=conv_W2_context, b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r

    # batch normalization for the four output sentence representation vectors
    gamma = theano.shared(np.asarray(rng.uniform(low=-1.0 / math.sqrt(hidden_size[0]),
                                                 high=1.0 / math.sqrt(hidden_size[0]),
                                                 size=(hidden_size[0])),
                                     dtype=theano.config.floatX), borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]), dtype=theano.config.floatX), borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l, gamma=gamma, beta=beta,
        mean=attentive_sent_embeddings_l.mean((0,), keepdims=True),
        std=attentive_sent_embeddings_l.std((0,), keepdims=True), mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r, gamma=gamma, beta=beta,
        mean=attentive_sent_embeddings_r.mean((0,), keepdims=True),
        std=attentive_sent_embeddings_r.std((0,), keepdims=True), mode='low_mem')
    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2, gamma=gamma, beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0,), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0,), keepdims=True), mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2, gamma=gamma, beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0,), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0,), keepdims=True), mode='low_mem')

    # before the logistic regression layer we insert a hidden layer; form its input here
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ], axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]

    # create hidden-layer parameters
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]

    # hidden layer, followed by batch normalization of its output
    HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[1],
                             W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(low=-1.0 / math.sqrt(hidden_size[1]),
                                                    high=1.0 / math.sqrt(hidden_size[1]),
                                                    size=(hidden_size[1])),
                                        dtype=theano.config.floatX), borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]), dtype=theano.config.floatX), borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(inputs=HL_layer_1.output, gamma=gamma_HL, beta=beta_HL,
                                       mean=HL_layer_1.output.mean((0,), keepdims=True),
                                       std=HL_layer_1.output.std((0,), keepdims=True), mode='low_mem')

    # form input to the LR classifier
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # weight matrix of shape (LR_input_size, 3)
    LR_b = theano.shared(value=np.zeros((3,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]

    # logistic regression layer
    layer_LR = LogisticRegression(rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3,
                                  W=U_a, b=LR_b)  # essentially a weight-matrix multiplication of the input feature vector plus softmax
    loss = layer_LR.negative_log_likelihood(labels)  # negative log likelihood is the usual classification loss; lower is better

    params = [init_embeddings] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    # use AdaGrad to update parameters
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost,
                                  updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                 layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)  # shuffle the training set for each new epoch; usually helps, but not guaranteed
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter counts how many batches have been run across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2 and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                    time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each dev batch
                    dev_error_i = dev_model(dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                                            dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                                            dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                                            dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                                            dev_labels_store[dev_batch_id:dev_batch_id + batch_size])
                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    # best dev model so far: evaluate on the test set
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(test_sents_l[test_batch_id:test_batch_id + batch_size],
                                             test_masks_l[test_batch_id:test_batch_id + batch_size],
                                             test_sents_r[test_batch_id:test_batch_id + batch_size],
                                             test_masks_r[test_batch_id:test_batch_id + batch_size],
                                             test_labels_store[test_batch_id:test_batch_id + batch_size])
                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))
                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_acc_test
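# --- Illustration only (not part of the original script) -----------------------------------------
# The comment above says AdaGrad is used; Gradient_Cost_Para is assumed to build updates like the
# explicit per-parameter AdaGrad loop that appears in the last evaluate_lenet5 variant below:
# accumulate squared gradients and divide the learning rate by their square root.
def _adagrad_updates_sketch(cost, params, learning_rate):
    import numpy as np
    import theano
    import theano.tensor as T
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i in zip(params, grads):
        acc_i = theano.shared(np.zeros_like(param_i.get_value(borrow=True), dtype=theano.config.floatX), borrow=True)
        acc_new = acc_i + T.sqr(grad_i)                 # running sum of squared gradients
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc_new) + 1e-8)))  # 1e-8 avoids division by zero
        updates.append((acc_i, acc_new))
    return updates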
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=70, filter_size=[3],
                    maxSentLen=40, hidden_size=[300, 300]):
    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fix the random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    # load raw data
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen)  # each example has one label and at least one word per sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    # first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec
    # embeddings to initialize covered words; uncovered words keep their random initialization
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX),
                                    borrow=True)  # wrap the numpy array "rand_values" into a theano shared variable

    # now build the symbolic inputs of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # use the word ids to retrieve word embeddings from "init_embeddings": each sentence becomes a
    # matrix (emb_size, sen_length), so a minibatch is a tensor3 (batch_size, emb_size, sen_length)
    embed_input_l = init_embeddings[sents_ids_l.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)  # this input format suits CNN, GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    # create parameters for the attentive convolution function
    conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    NN_para = [conv_W, conv_b, conv_W_context]

    # attentive convolution
    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W, b=conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    # logistic regression layer
    LR_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l + attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ], axis=1)
    LR_input_size = 4 * hidden_size[0]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # weight matrix of shape (LR_input_size, 3)
    LR_b = theano.shared(value=np.zeros((3,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3,
                                  W=U_a, b=LR_b)  # essentially a weight-matrix multiplication of the input feature vector plus softmax
    loss = layer_LR.negative_log_likelihood(labels)  # negative log likelihood is the usual classification loss; lower is better

    params = [init_embeddings] + NN_para + LR_para
    cost = loss
    # use AdaGrad to update parameters
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost,
                                  updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                 layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)  # shuffle the training set for each new epoch; usually helps, but not guaranteed
        iter_accu = 0
        for batch_id in train_batch_start:  # for each batch
            # iter counts how many batches have been run across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            # every 1000 batches, evaluate the model on all test data
            # if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2 and iter % 5 == 0):
            if iter % 1000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                    time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                # test
                error_sum = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i = test_model(test_sents_l[test_batch_id:test_batch_id + batch_size],
                                         test_masks_l[test_batch_id:test_batch_id + batch_size],
                                         test_sents_r[test_batch_id:test_batch_id + batch_size],
                                         test_masks_r[test_batch_id:test_batch_id + batch_size],
                                         test_labels_store[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                test_acc = 1.0 - error_sum / (len(test_batch_start))
                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_acc_test
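# --- Illustration only (not part of the original script) -----------------------------------------
# Sketch of how the batch-start lists above are built: every minibatch must have exactly batch_size
# examples (the Theano graph is compiled for a fixed batch_size), so the final start index is
# size - batch_size and the last batch overlaps the previous one instead of being smaller.
def _batch_starts_sketch(size, batch_size):
    import numpy as np
    n_batches = size // batch_size                 # integer division (the original uses Python 2's '/')
    return list(np.arange(n_batches) * batch_size) + [size - batch_size]

# e.g. _batch_starts_sketch(230, 70) -> [0, 70, 140, 160]: the last batch covers examples 160..229.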
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50,
                    filter_size=3, maxSentLen=50, nn='CNN'):
    hidden_size = emb_size
    model_options = locals().copy()
    print "model options", model_options

    rng = np.random.RandomState(1234)  # fix the random seed so the model produces reproducible results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen)  # each example has one label and at least one word per sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)

    vocab_size = len(word2id) + 1
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian
    # the word2vec initialization is left commented out in this baseline
    # rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    # id2word = {y: x for x, y in word2id.iteritems()}
    # word2vec = load_word2vec()
    # rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX),
                               borrow=True)  # wrap the numpy array "rand_values" into a theano shared variable

    # now build the symbolic inputs of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input_l = embeddings[sents_ids_l.flatten()].reshape((batch_size, maxSentLen, emb_size))  # this input format suits CNN, GRU or LSTM
    common_input_r = embeddings[sents_ids_r.flatten()].reshape((batch_size, maxSentLen, emb_size))

    # CNN encoder
    if nn == 'CNN':
        conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size))
        conv_W_into_matrix = conv_W.reshape((conv_W.shape[0], conv_W.shape[2] * conv_W.shape[3]))
        NN_para = [conv_W, conv_b]

        conv_input_l = common_input_l.dimshuffle((0, 'x', 2, 1))  # (batch_size, 1, emb_size, maxSentLen)
        conv_model_l = Conv_with_input_para(rng, input=conv_input_l,
                                            image_shape=(batch_size, 1, emb_size, maxSentLen),
                                            filter_shape=(hidden_size, 1, emb_size, filter_size),
                                            W=conv_W, b=conv_b)
        conv_output_l = conv_model_l.narrow_conv_out  # (batch, 1, hidden_size, maxSentLen-filter_size+1)
        conv_output_into_tensor3_l = conv_output_l.reshape((batch_size, hidden_size, maxSentLen - filter_size + 1))
        mask_for_conv_output_l = T.repeat(
            sents_mask_l[:, filter_size - 1:].reshape((batch_size, 1, maxSentLen - filter_size + 1)),
            hidden_size, axis=1)  # (batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_l = (1.0 - mask_for_conv_output_l) * (mask_for_conv_output_l - 10)
        masked_conv_output_l = conv_output_into_tensor3_l + mask_for_conv_output_l  # add -10 to padded positions so max-pooling ignores them
        sent_embeddings_l = T.max(masked_conv_output_l, axis=2)  # (batch_size, hidden_size): one vector per sentence

        conv_input_r = common_input_r.dimshuffle((0, 'x', 2, 1))  # (batch_size, 1, emb_size, maxSentLen)
        conv_model_r = Conv_with_input_para(rng, input=conv_input_r,
                                            image_shape=(batch_size, 1, emb_size, maxSentLen),
                                            filter_shape=(hidden_size, 1, emb_size, filter_size),
                                            W=conv_W, b=conv_b)
        conv_output_r = conv_model_r.narrow_conv_out  # (batch, 1, hidden_size, maxSentLen-filter_size+1)
        conv_output_into_tensor3_r = conv_output_r.reshape((batch_size, hidden_size, maxSentLen - filter_size + 1))
        mask_for_conv_output_r = T.repeat(
            sents_mask_r[:, filter_size - 1:].reshape((batch_size, 1, maxSentLen - filter_size + 1)),
            hidden_size, axis=1)  # (batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_r = (1.0 - mask_for_conv_output_r) * (mask_for_conv_output_r - 10)
        masked_conv_output_r = conv_output_into_tensor3_r + mask_for_conv_output_r  # add -10 to padded positions so max-pooling ignores them
        sent_embeddings_r = T.max(masked_conv_output_r, axis=2)  # (batch_size, hidden_size): one vector per sentence

    # GRU encoder
    if nn == 'GRU':
        U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
        NN_para = [U1, W1, b1]  # U1 and W1 each contain 3 matrices; b1 is the bias
        gru_input_l = common_input_l.dimshuffle((0, 2, 1))  # GRU requires input (batch_size, emb_size, maxSentLen)
        gru_layer_l = GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l, hidden_size, U1, W1, b1)
        sent_embeddings_l = gru_layer_l.output_sent_rep  # (batch_size, hidden_size)
        gru_input_r = common_input_r.dimshuffle((0, 2, 1))
        gru_layer_r = GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r, hidden_size, U1, W1, b1)
        sent_embeddings_r = gru_layer_r.output_sent_rep  # (batch_size, hidden_size)

    # LSTM encoder
    if nn == 'LSTM':
        LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
        NN_para = LSTM_para_dict.values()  # .values() returns a list of parameters
        lstm_input_l = common_input_l.dimshuffle((0, 2, 1))  # LSTM takes the same input format as GRU
        lstm_layer_l = LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l, hidden_size, LSTM_para_dict)
        sent_embeddings_l = lstm_layer_l.output_sent_rep  # (batch_size, hidden_size)
        lstm_input_r = common_input_r.dimshuffle((0, 2, 1))
        lstm_layer_r = LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r, hidden_size, LSTM_para_dict)
        sent_embeddings_r = lstm_layer_r.output_sent_rep  # (batch_size, hidden_size)

    HL_layer_1_input = T.concatenate([sent_embeddings_l, sent_embeddings_r,
                                      sent_embeddings_l * sent_embeddings_r,
                                      cosine_matrix1_matrix2_rowwise(sent_embeddings_l, sent_embeddings_r).dimshuffle(0, 'x')],
                                     axis=1)
    HL_layer_1_input_size = hidden_size * 3 + 1
    HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh)
    HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh)

    # classification layer: maps the concatenated feature vector to the three target classes
    LR_input_size = HL_layer_1_input_size + 2 * hidden_size
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # weight matrix of shape (LR_input_size, 3)
    LR_b = theano.shared(value=np.zeros((3,), dtype=theano.config.floatX), name='LR_b', borrow=True)  # bias for each target class
    LR_para = [U_a, LR_b]
    LR_input = T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1)
    layer_LR = LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a,
                                  b=LR_b)  # essentially a weight-matrix multiplication of the input feature vector plus softmax
    loss = layer_LR.negative_log_likelihood(labels)  # negative log likelihood is the usual classification loss; lower is better

    params = [embeddings] + NN_para + LR_para + HL_layer_1.params + HL_layer_2.params  # all model parameters
    # L2_reg = L2norm_paraList([embeddings, conv_W, U_a])
    # diversify_reg = Diversify_Reg(U_a.T) + Diversify_Reg(conv_W_into_matrix)
    cost = loss  # + Div_reg * diversify_reg + L2_weight * L2_reg

    # AdaGrad updates: accumulate squared gradients per parameter and scale the learning rate accordingly
    grads = T.grad(cost, params)  # a list of gradients for all model parameters
    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))  # 1e-8 avoids division by zero
        updates.append((acc_i, acc))

    # train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost,
                                  updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
                                 layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    while epoch < n_epochs:
        epoch = epoch + 1
        train_indices = range(train_size)
        random.Random(200).shuffle(train_indices)  # shuffle the training set for each new epoch; usually helps, but not guaranteed
        iter_accu = 0
        cost_i = 0.0
        for batch_id in train_batch_start:  # for each batch
            # iter counts how many batches have been run across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            # every 500 batches, evaluate on the dev set (and on the test set when dev improves)
            if iter % 500 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                    time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                # if epoch >= 3 and iter >= len(train_batch_start) * 2.0 / 3 and iter % 500 == 0:
                #     print 'Epoch ', epoch, 'iter ' + str(iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                #     past_time = time.time()

                error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each dev batch
                    error_i = dev_model(dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                                        dev_labels_store[dev_batch_id:dev_batch_id + batch_size])
                    error_sum += error_i
                dev_accuracy = 1.0 - error_sum / (len(dev_batch_start))

                if dev_accuracy > max_acc_dev:
                    max_acc_dev = dev_accuracy
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                    # best dev model so far: evaluate on the test set
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(test_sents_l[test_batch_id:test_batch_id + batch_size],
                                             test_masks_l[test_batch_id:test_batch_id + batch_size],
                                             test_sents_r[test_batch_id:test_batch_id + batch_size],
                                             test_masks_r[test_batch_id:test_batch_id + batch_size],
                                             test_labels_store[test_batch_id:test_batch_id + batch_size])
                        error_sum += error_i
                    test_accuracy = 1.0 - error_sum / (len(test_batch_start))
                    if test_accuracy > max_acc_test:
                        max_acc_test = test_accuracy
                    print '\t\tcurrent test_acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test
                else:
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_acc_test
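# --- Illustration only (not part of the original script) -----------------------------------------
# Numpy sketch of the masked max-pooling used in the CNN branch above: (1.0 - m) * (m - 10) is 0 at
# real positions (m == 1) and -10 at padded positions (m == 0), so adding it to the feature map
# pushes padded columns far down before the max over the time axis (assuming features stay above -10).
def _masked_maxpool_sketch(feature_map, mask):
    # feature_map: (hidden_size, n_positions); mask: (n_positions,) of 0/1 values
    import numpy as np
    additive_mask = (1.0 - mask) * (mask - 10.0)   # 0 at real positions, -10 at padded ones
    return np.max(feature_map + additive_mask, axis=1)

# e.g. _masked_maxpool_sketch(np.zeros((3, 4)), np.array([1., 1., 0., 0.])) pools only over the
# first two (unpadded) positions and returns a zero vector of length 3.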