Code Example #1
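# NOTE: the listing below omits its import preamble. A minimal set of imports it relies
# on is sketched here; the project-specific helpers (load_SICK_corpus, load_mts_wikiQA,
# load_extra_features, load_word2vec_to_init, random_value_normal, debug_print,
# create_GRU_para, GRU_Batch_Tensor_Input, GRU_Matrix_Input, Matrix_Bit_Shift, cosine,
# EUCLID, compute_simi_feature_matrix_with_matrix and LogisticRegression) are assumed
# to come from the accompanying project modules.
import os
import sys
import time

import numpy
import theano
import theano.tensor as T
from sklearn import svm, linear_model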
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[50,50], batch_size=10, window_width=3,
                    maxSentLength=64, emb_size=50, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=33):# max_truncate can be 45
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/';
    rng = numpy.random.RandomState(23455)
#     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True)
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    
    
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
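    # Sentence pairs are stored interleaved: even rows ([::2]) hold the left sentences
    # and odd rows ([1::2]) the right sentences; the length and padding arrays below
    # follow the same convention.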
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad = datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]//batch_size
    n_test_batches=indices_test_l.shape[0]//batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')
#     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
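    # Row 0 of the embedding matrix is kept all-zero (presumably the padding index);
    # the remaining rows are overwritten with pretrained 50-d GloVe vectors where available.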
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lvector()
    right_l=T.lvector()
    left_r=T.lvector()
    right_r=T.lvector()
    length_l=T.lvector()
    length_r=T.lvector()
    norm_length_l=T.dvector()
    norm_length_r=T.dvector()
    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    cost_tmp=T.dscalar()




#     #GPU
#     index = T.iscalar()
#     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.imatrix('x_index_r')
#     y = T.ivector('y')  
#     left_l=T.iscalar()
#     right_l=T.iscalar()
#     left_r=T.iscalar()
#     right_r=T.iscalar()
#     length_l=T.iscalar()
#     length_r=T.iscalar()
#     norm_length_l=T.fscalar()
#     norm_length_r=T.fscalar()
#     #mts=T.dmatrix()
#     #wmf=T.dmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of one padded sentence matrix: (embedding dim, sentence length)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up word embeddings for every index and reshape them so that each
    # sentence becomes an (emb_size, maxSentLength) matrix: (batch_size, emb_size, maxSentLength)
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(embeddings[x_index_l.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_l_input')
    layer0_r_input = debug_print(embeddings[x_index_r.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_r_input')
    #paras:
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b]     
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para=[U1, W1, b1] 
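    # Both sentences of a pair are encoded with the same GRU parameters (a siamese
    # set-up): layer0 maps word embeddings to nkerns[0]-dim hidden states, and layer1
    # re-encodes the attention-selected columns into nkerns[1]-dim states.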
    def loop (l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i):   
        l_input_tensor=debug_print(Matrix_Bit_Shift(l_matrix[:,l_left:-l_right]), 'l_input_tensor')
        r_input_tensor=debug_print(Matrix_Bit_Shift(r_matrix[:,r_left:-r_right]), 'r_input_tensor')
        
        addition_l=T.sum(l_matrix[:,l_left:-l_right], axis=1)
        addition_r=T.sum(r_matrix[:,r_left:-r_right], axis=1)
        cosine_addition=cosine(addition_l, addition_r)
        eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
        
        layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        
        cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
        eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
        
        attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
        
        l_max_attention=T.max(attention_matrix, axis=1)
        neighborsArgSorted = T.argsort(l_max_attention)
        kNeighborsArg = neighborsArgSorted[:3]#keep the 3 columns with the smallest max-attention
        ll = T.sort(kNeighborsArg).flatten() # put the selected column indices in ascending order
    
    
        r_max_attention=T.max(attention_matrix, axis=0)
        neighborsArgSorted_r = T.argsort(r_max_attention)
        kNeighborsArg_r = neighborsArgSorted_r[:3]#keep the 3 columns with the smallest max-attention
        rr = T.sort(kNeighborsArg_r).flatten() # put the selected column indices in ascending order
    
        
        l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
        r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
        

    
        layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
        layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    
        vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
        vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')
    
        
        
    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #     
        uni_cosine=cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
        eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #     
        len_l=norm_length_l_i.reshape((1,1))
        len_r=norm_length_r_i.reshape((1,1))  
    #     
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))  
    #     '''
        #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
        #length_gap=T.sqrt((len_l-len_r)**2)
        #layer3_input=mts
#         layer3_input_nn=T.concatenate([vec_l, vec_r,
#                                     cosine_addition, eucli_addition,
#     #                                 cosine_sent, eucli_sent,
#                                     uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
        
        output_i=T.concatenate([vec_l, vec_r,
                                    cosine_addition, eucli_addition,
    #                                 cosine_sent, eucli_sent,
                                    uni_cosine,eucli_1,
                                    mts_i.reshape((1,14)),
                                    len_l, len_r,
                                    extra_i.reshape((1,9))], axis=1)#, layer2.output, layer1.output_cosine], axis=1)    
        return output_i
    
    layer3_input, _ = theano.scan(fn=loop,
                            sequences=[left_l, right_l, layer0_l_input, left_r, right_r, layer0_r_input, mts, extra, norm_length_l, norm_length_r],
                            outputs_info=None,#[self.h0, None],
                            n_steps=batch_size)       
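    # theano.scan applies `loop` to every sentence pair in the minibatch; each call
    # returns a (1, feature_size) row, so the stacked result has shape
    # (batch_size, 1, feature_size) and is flattened to (batch_size, feature_size) below.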
#l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i

    
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
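    # Per-pair feature vector (see `loop` above): vec_l and vec_r (2*nkerns[1]),
    # cosine/Euclidean similarity of the summed word embeddings (2), cosine/Euclidean
    # similarity of the second-layer GRU vectors (2), 14 MT metrics, the two
    # normalized sentence lengths, and 9 rule-based extra features.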
    feature_size=2*nkerns[1]+2+2+14+2+9
    layer3_input=layer3_input.reshape((batch_size, feature_size))
    layer3=LogisticRegression(rng, input=layer3_input, n_in=feature_size, n_out=3)

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index: index + batch_size],
            right_l: testRightPad_l[index: index + batch_size],
            left_r: testLeftPad_r[index: index + batch_size],
            right_r: testRightPad_r[index: index + batch_size],
            length_l: testLengths_l[index: index + batch_size],
            length_r: testLengths_r[index: index + batch_size],
            norm_length_l: normalized_test_length_l[index: index + batch_size],
            norm_length_r: normalized_test_length_r[index: index + batch_size],
            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#       
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
# 
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    
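    # The Adam variant below follows the common Theano recipe in which b1 and b2 play
    # the role of (1 - beta1) and (1 - beta2); fix1/fix2 implement the bias correction
    # and lr_t is the bias-corrected step size.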

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
    
    updates=Adam(cost=cost, params=params, lr=learning_rate)
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)

    train_model_predict = theano.function([index, cost_tmp], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    acc_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far (across epochs)
#             if (batch_start+1)%1000==0:
#                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y)
                    test_features.append(layer3_input)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+
 
                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_features=numpy.concatenate(test_features, axis=0)
                test_y=numpy.concatenate(test_y, axis=0)
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                acc_nn=1-test_score
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 
                #this step is risky: if the training set is large, it roughly doubles the training time
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                    train_y.append(y)
                    train_features.append(layer3_input)
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1
 
                train_features=numpy.concatenate(train_features, axis=0)
                train_y=numpy.concatenate(train_y, axis=0)
 
                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_count_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if results_lr[i]==test_y[i]:
                        corr_count_lr+=1
                acc_svm=corr_count*1.0/test_size
                acc_lr=corr_count_lr*1.0/test_size
                if acc_svm > acc_max:
                    acc_max=acc_svm
                    best_epoch=epoch
                if acc_lr > acc_max:
                    acc_max=acc_lr
                    best_epoch=epoch
                if acc_nn > acc_max:
                    acc_max=acc_nn
                    best_epoch=epoch
                print  'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ',    acc_max , ' at epoch: ', best_epoch  

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code Example #2
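# NOTE: the same import preamble and project helper functions as in Code Example #1
# are assumed here.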
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[256, 256],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=33):  # max_truncate can be 45
    maxSentLength = max_truncate + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SICK/'
    rng = numpy.random.RandomState(23455)
    #     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size = load_SICK_corpus(rootPath + 'vocab.txt',
                                            rootPath + 'train_plus_dev.txt',
                                            rootPath + 'test.txt',
                                            max_truncate,
                                            maxSentLength,
                                            entailment=True)
    mt_train, mt_test = load_mts_wikiQA(
        rootPath + 'Train_plus_dev_MT/concate_14mt_train.txt',
        rootPath + 'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test = load_extra_features(
        rootPath +
        'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt',
        rootPath +
        'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt'
    )
    discri_train, discri_test = load_extra_features(
        rootPath + 'train_plus_dev_discri_features_0.3.txt',
        rootPath + 'test_discri_features_0.3.txt')

    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]

    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] // batch_size
    n_test_batches = indices_test_l.shape[0] // batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_glove_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    extra = T.dmatrix()
    discri = T.dmatrix()
    cost_tmp = T.dscalar()

    #     #GPU
    #     index = T.iscalar()
    #     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
    #     x_index_r = T.imatrix('x_index_r')
    #     y = T.ivector('y')
    #     left_l=T.iscalar()
    #     right_l=T.iscalar()
    #     left_r=T.iscalar()
    #     right_r=T.iscalar()
    #     length_l=T.iscalar()
    #     length_r=T.iscalar()
    #     norm_length_l=T.fscalar()
    #     norm_length_r=T.fscalar()
    #     #mts=T.dmatrix()
    #     #wmf=T.dmatrix()
    #     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of one padded sentence matrix: (embedding dim, sentence length)
    filter_size = (emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up word embeddings for the single pair in the batch and reshape each
    # sentence into an (emb_size, maxSentLength) matrix.
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(
        embeddings[x_index_l.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_l_input')
    layer0_r_input = debug_print(
        embeddings[x_index_r.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_r_input')

    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor')

    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))  #25.2%

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)

    cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))  #25.2%

    #ibm attentive pooling at extended sentence level
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim,
        layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) / 2)
    #     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
    #     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
    #     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
    #     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()
    #     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
    #     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%

    #ibm attentive pooling at original sentence level
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))  #25.2%

    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[:3]  # keep the 3 columns with the smallest max-attention
    ll = T.sort(kNeighborsArg).flatten()  # put the selected column indices in ascending order

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[:3]  # keep the 3 columns with the smallest max-attention
    rr = T.sort(kNeighborsArg_r).flatten()  # put the selected column indices in ascending order

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')

    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]

    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])),
                        'vec_l')
    vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])),
                        'vec_r')

    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #
    uni_cosine = cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))  #25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))
    #     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [
            vec_l,
            vec_r,
            uni_cosine,
            eucli_1,
            cosine_addition,
            eucli_addition,
            #                                 cosine_sent, eucli_sent,
            ibm_l.reshape((1, nkerns[0])),
            ibm_r.reshape((1, nkerns[0])),  #2*nkerns[0]+
            cosine_ibm,
            eucli_ibm,

            #                                 ibm_l_extended.reshape((1, nkerns[0])), ibm_r_extended.reshape((1, nkerns[0])), #2*nkerns[0]+
            #                                 cosine_ibm_extended, eucli_ibm_extended,
            mts,
            len_l,
            len_r,
            extra
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
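    # Feature vector: vec_l/vec_r plus their cosine and Euclidean similarities
    # (2*nkerns[1]+2), similarities of the summed embeddings (2), the attentive-pooling
    # vectors ibm_l/ibm_r plus their similarities (2*nkerns[0]+2), 14 MT metrics, the two
    # normalized lengths, and 9 rule-based extra features; this matches n_in of the
    # logistic regression layer below.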

    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(2 * nkerns[1] + 2) + 2 +
                                (2 * nkerns[0] + 2) + 14 + 2 + 9,
                                n_out=3)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() +
        (W1**2).sum(), 'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            extra: extra_test[index:index + batch_size],
            discri: discri_test[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer1_para + layer0_para  #+[embeddings]# + layer1.params
    #     params_conv = [conv_W, conv_b]
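    # AdaGrad: accumulate squared gradients per parameter and scale each step by
    # learning_rate / sqrt(accumulated squared gradient).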

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))


#     def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
#         updates = []
#         grads = T.grad(cost, params)
#         i = theano.shared(numpy.float64(0.))
#         i_t = i + 1.
#         fix1 = 1. - (1. - b1)**i_t
#         fix2 = 1. - (1. - b2)**i_t
#         lr_t = lr * (T.sqrt(fix2) / fix1)
#         for p, g in zip(params, grads):
#             m = theano.shared(p.get_value() * 0.)
#             v = theano.shared(p.get_value() * 0.)
#             m_t = (b1 * g) + ((1. - b1) * m)
#             v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
#             g_t = m_t / (T.sqrt(v_t) + e)
#             p_t = p - (lr_t * g_t)
#             updates.append((m, m_t))
#             updates.append((v, v_t))
#             updates.append((p, p_t))
#         updates.append((i, i_t))
#         return updates
#
#     updates=Adam(cost=cost, params=params, lr=learning_rate)

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            extra: extra_train[index:index + batch_size],
            discri: discri_train[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    train_model_predict = theano.function(
        [index, cost_tmp],
        [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            extra: extra_train[index:index + batch_size],
            discri: discri_train[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False

    acc_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #         shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far (across epochs)
            #             if (batch_start+1)%1000==0:
            #                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    test_loss, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') % (epoch, minibatch_index, n_train_batches,
                                       (1 - test_score) * 100.))
                acc_nn = 1 - test_score
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                #this step is risky: if the training set is large, it roughly doubles the training time
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start, 0.0)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results = clf.predict(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                corr_count = 0
                corr_count_lr = 0
                test_size = len(test_y)
                for i in range(test_size):
                    if results[i] == test_y[i]:
                        corr_count += 1
                    if results_lr[i] == test_y[i]:
                        corr_count_lr += 1
                acc_svm = corr_count * 1.0 / test_size
                acc_lr = corr_count_lr * 1.0 / test_size
                if acc_svm > acc_max:
                    acc_max = acc_svm
                    best_epoch = epoch
                if acc_lr > acc_max:
                    acc_max = acc_lr
                    best_epoch = epoch
                if acc_nn > acc_max:
                    acc_max = acc_nn
                    best_epoch = epoch
                print 'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ', acc_max, ' at epoch: ', best_epoch

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code Example #3
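# NOTE: the same import preamble as in Code Example #1 is assumed; the WikiQA-specific
# loaders (load_wikiQA_corpus, load_wmf_wikiQA) are likewise assumed to come from the
# accompanying project modules.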
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[256,256], batch_size=1, window_width=[4,4],
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, Div_reg=0.06, update_freq=1, norm_threshold=5.0, max_truncate=40):
    maxSentLength=max_truncate+2*(window_width[0]-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]//batch_size
    n_test_batches=indices_test_l.shape[0]//batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of one padded sentence matrix: (embedding dim, sentence length)
    filter_size=(emb_size,window_width[0])
    filter_size_2=(nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up word embeddings for the single pair in the batch and reshape each
    # sentence into an (emb_size, maxSentLength) matrix.
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((maxSentLength, emb_size)).transpose()
    
    
    l_input_tensor=debug_print(Matrix_Bit_Shift(layer0_l_input[:,left_l:-right_l]), 'l_input_tensor')
    r_input_tensor=debug_print(Matrix_Bit_Shift(layer0_r_input[:,left_r:-right_r]), 'r_input_tensor')
    
    addition_l=T.sum(layer0_l_input[:,left_l:-right_l], axis=1)
    addition_r=T.sum(layer0_r_input[:,left_r:-right_r], axis=1)
    cosine_addition=cosine(addition_l, addition_r)
    eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
    
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b] 

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    
    cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
    
    #ibm attentive pooling at extended sentence level
    attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
#     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
#     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
#     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
#     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()    
#     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
#     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%      
    
    
    
    #ibm attentive pooling at original sentence level
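    # Attentive pooling over the original-length hidden states: build a pairwise
    # similarity matrix between the two sentences' hidden states, take its row-wise /
    # column-wise maxima, turn them into attention weights with a softmax, and use the
    # weights to pool each sentence into a single vector (ibm_l, ibm_r).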
    simi_matrix_sent=compute_simi_feature_matrix_with_matrix(layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates, length_l, length_r, maxSentLength)
    attention_vec_l=T.nnet.softmax(T.max(simi_matrix_sent, axis=1)).transpose()
    ibm_l=layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r=T.nnet.softmax(T.max(simi_matrix_sent, axis=0)).transpose()
    ibm_r=layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()    
    cosine_ibm=cosine(ibm_l, ibm_r)
    eucli_ibm=1.0/(1.0+EUCLID(ibm_l, ibm_r))#25.2%    
    
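    # Hard attention over the extended representations: attention_matrix (computed above
    # from the output_matrix of both GRUs) is reduced to a per-position maximum, and only
    # the 3 positions with the largest attention on each side are kept, in their original
    # order, as input to the second-level GRU below.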
    l_max_attention=T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]  # keep only the 3 positions with the largest attention
    ll = T.sort(kNeighborsArg).flatten()  # restore ascending order of the selected indices


    r_max_attention=T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[-3:]  # keep only the 3 positions with the largest attention
    rr = T.sort(kNeighborsArg_r).flatten()  # restore ascending order of the selected indices

    
    l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
    r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
    
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para=[U1, W1, b1] 

    layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)

    vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
    vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')

    
    
#     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
#     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
#     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
#     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
#     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
#     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
#     
    uni_cosine=cosine(vec_l, vec_r)
#     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
#     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
#     '''
#     linear=Linear(sum_uni_l, sum_uni_r)
#     poly=Poly(sum_uni_l, sum_uni_r)
#     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
#     rbf=RBF(sum_uni_l, sum_uni_r)
#     gesd=GESD(sum_uni_l, sum_uni_r)
#     '''
    eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
#     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
#     
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
#     
#     '''
#     len_l=length_l.reshape((1,1))
#     len_r=length_r.reshape((1,1))  
#     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
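    # Final feature vector for the classifier, concatenated along axis=1:
    #   vec_l, vec_r                    -> 2*nkerns[1]  (second-level GRU sentence vectors)
    #   uni_cosine, eucli_1             -> 2
    #   cosine_addition, eucli_addition -> 2            (bag-of-embeddings similarities)
    #   ibm_l, ibm_r                    -> 2*nkerns[0]  (attentively pooled vectors)
    #   cosine_ibm, eucli_ibm           -> 2
    #   len_l, len_r                    -> 2            (normalized sentence lengths)
    #   wmf                             -> presumably 2 columns, matching the trailing "+2"
    #                                      in n_in of the LogisticRegression below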
    layer3_input=T.concatenate([vec_l, vec_r,
                                uni_cosine,eucli_1,
                                cosine_addition, eucli_addition,
#                                 cosine_sent, eucli_sent,

                                ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])), #2*nkerns[0]+
                                cosine_ibm, eucli_ibm,
                                len_l, len_r,wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(2*nkerns[1]+2)+2 +(2*nkerns[0]+2)+2+2, n_out=2)
    
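    # Regularization: an L2 penalty on the classifier weights and both GRUs' weight
    # tensors, plus a "diversify" penalty (Diversify_Reg) on each gate's weight matrix.
    # NOTE: Div_reg, the weight on diversify_reg used in `cost` below, is not defined in
    # this snippet and is assumed to be a hyperparameter set elsewhere in the full script.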
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(layer3.W.T)+Diversify_Reg(U[0])+Diversify_Reg(W[0])+Diversify_Reg(U1[0])+Diversify_Reg(W1[0])+Diversify_Reg(U[1])+Diversify_Reg(W[1])+Diversify_Reg(U1[1])+Diversify_Reg(W1[1])+Diversify_Reg(U[2])+Diversify_Reg(W[2])+Diversify_Reg(U1[2])+Diversify_Reg(W1[2])
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg+Div_reg*diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
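    # Compiled test function: `index` selects one minibatch and all model inputs are
    # supplied through `givens`. wm_test (and wm_train for the training functions below)
    # is not loaded in this snippet; it is assumed to be a per-pair feature matrix defined
    # elsewhere in the full script. on_unused_input='ignore' keeps Theano from complaining
    # about declared inputs that do not reach these particular outputs.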
    test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#         
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
#   
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    

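    # (The commented block above is a disabled AdaGrad updater.)
    # Adam in the common "one-minus-beta" Theano formulation: b1 and b2 play the role of
    # 1-beta1 and 1-beta2, m and v are per-parameter running averages of the gradient and
    # squared gradient, and fix1/fix2 fold the bias correction into lr_t.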
    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.asarray(0., dtype=theano.config.floatX))  # timestep counter in floatX so the updates keep a consistent dtype
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
      
    updates=Adam(cost=cost, params=params, lr=learning_rate)  
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0

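    # Training loop: minibatch costs are accumulated in cost_tmp for update_freq-1 batches
    # (train_model_predict computes cost/error without updating the parameters), and every
    # update_freq-th batch train_model applies one Adam update using the accumulated cost.
    # When update_freq is 1, every minibatch triggers an update and cost_tmp stays 0.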
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
#             print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
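            # Periodic evaluation: run the compiled test function over all test batches,
            # collect the positive-class probabilities and the layer3 feature vectors, and
            # score the ranking with MAP/MRR (compute_map_mrr against test_filtered.txt).
            # The same features extracted from the training set are then used to fit an SVM
            # and a linear regression as additional re-rankers (svm / LinearRegression are
            # assumed to be scikit-learn's, imported at the top of the full script).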
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    prob_i, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs)
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP %f, MRR %f') %
                          (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm=clf.decision_function(test_features)
                MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm)
                
                lr=LinearRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
    end_time = time.time()
    print('Optimization complete.')
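    # NOTE: best_validation_loss, best_iter and test_score are never updated in the loop
    # above; this final report is early-stopping scaffolding left over from the Theano
    # tutorial and will always print the initial values.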
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))