# Two-layer GRU with attention-based pruning for SICK textual entailment
# (3-way classification); GRU features are combined with 14 MT metrics and
# 9 hand-crafted rule features and fed to logistic regression / SVM.
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[50,50], batch_size=10, window_width=3,
                    maxSentLength=64, emb_size=50, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=33):# max_truncate can be 45
    maxSentLength=max_truncate+2*(window_width-1)  # overrides the argument above: truncated length plus (window_width-1) padding on each side
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/'
    rng = numpy.random.RandomState(23455)
#     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True)
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    
    
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
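    # Sentence pairs are stored interleaved: even rows hold the left sentence,
    # odd rows the right one, so [::2] / [1::2] de-interleaves every array.
    # A minimal numpy sketch of the same de-interleaving (illustration only):
    #   pairs = numpy.arange(12).reshape(6, 2)   # 3 interleaved sentence pairs
    #   left, right = pairs[::2], pairs[1::2]    # rows 0,2,4 vs. rows 1,3,5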
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad = datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
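    # Python 2 integer division floors here, so up to batch_size-1 trailing
    # examples are silently dropped from both the train and the test set.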
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    # Index matrices are stored as floatX shared variables (the type Theano can
    # keep on the GPU) and then symbolically cast back to int64 for embedding lookups.
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)  # reserve row 0 as an all-zero vector (padding index)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')  # overwrite with pretrained 50d GloVe vectors where available
#     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # matrices of integer word indices, one row per sentence
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lvector()
    right_l=T.lvector()
    left_r=T.lvector()
    right_r=T.lvector()
    length_l=T.lvector()
    length_r=T.lvector()
    norm_length_l=T.dvector()
    norm_length_r=T.dvector()
    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    cost_tmp=T.dscalar()




#     #GPU
#     index = T.iscalar()
#     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.imatrix('x_index_r')
#     y = T.ivector('y')  
#     left_l=T.iscalar()
#     right_l=T.iscalar()
#     left_r=T.iscalar()
#     right_r=T.iscalar()
#     length_l=T.iscalar()
#     length_r=T.iscalar()
#     norm_length_l=T.fscalar()
#     norm_length_r=T.fscalar()
#     #mts=T.dmatrix()
#     #wmf=T.dmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # (embedding dim, padded sentence length)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    length_after_wideConv=ishape[1]+filter_size[1]-1
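    # A wide ("full") convolution lengthens n tokens to n + w - 1; e.g. with
    # max_truncate=33 and window_width=3, maxSentLength = 33 + 2*2 = 37 and
    # length_after_wideConv = 37 + 3 - 1 = 39.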
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up embeddings for each index matrix and reshape into
    # (batch_size, emb_size, maxSentLength) tensors for the layers below.
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(embeddings[x_index_l.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_l_input')
    layer0_r_input = debug_print(embeddings[x_index_r.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_r_input')
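    # Embedding lookup yields (batch_size*maxSentLength, emb_size); reshaping
    # per sentence and transposing makes columns index token positions, i.e.
    # each side becomes a (batch_size, emb_size, maxSentLength) tensor.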
    # Parameters are created once here and shared across all scan steps below.
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])      # first GRU layer: emb_size -> nkerns[0]
    layer0_para=[U, W, b]
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])  # second GRU layer: nkerns[0] -> nkerns[1]
    layer1_para=[U1, W1, b1]
    def loop (l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i):   
        # Strip the left/right padding; slicing with -l_right assumes right
        # padding > 0, which the 2*(window_width-1) padding above guarantees.
        l_input_tensor=debug_print(Matrix_Bit_Shift(l_matrix[:,l_left:-l_right]), 'l_input_tensor')
        r_input_tensor=debug_print(Matrix_Bit_Shift(r_matrix[:,r_left:-r_right]), 'r_input_tensor')
        
        addition_l=T.sum(l_matrix[:,l_left:-l_right], axis=1)
        addition_r=T.sum(r_matrix[:,r_left:-r_right], axis=1)
        cosine_addition=cosine(addition_l, addition_r)
        eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
        
        layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        
        cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
        eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
        
        attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
        
        l_max_attention=T.max(attention_matrix, axis=1)
        neighborsArgSorted = T.argsort(l_max_attention)
        kNeighborsArg = neighborsArgSorted[:3]#keep only the 3 positions with the lowest max-attention
        ll = T.sort(kNeighborsArg).flatten() # restore ascending column order
    
    
        r_max_attention=T.max(attention_matrix, axis=0)
        neighborsArgSorted_r = T.argsort(r_max_attention)
        kNeighborsArg_r = neighborsArgSorted_r[:3]#keep only the 3 positions with the lowest max-attention
        rr = T.sort(kNeighborsArg_r).flatten() # restore ascending column order
    
        
        l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
        r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
        

    
        layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
        layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    
        vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
        vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')
    
        
        
    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #     
        uni_cosine=cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
        eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #     
        len_l=norm_length_l_i.reshape((1,1))
        len_r=norm_length_r_i.reshape((1,1))  
    #     
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))  
    #     '''
        #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
        #length_gap=T.sqrt((len_l-len_r)**2)
        #layer3_input=mts
#         layer3_input_nn=T.concatenate([vec_l, vec_r,
#                                     cosine_addition, eucli_addition,
#     #                                 cosine_sent, eucli_sent,
#                                     uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
        
        output_i=T.concatenate([vec_l, vec_r,
                                    cosine_addition, eucli_addition,
    #                                 cosine_sent, eucli_sent,
                                    uni_cosine,eucli_1,
                                    mts_i.reshape((1,14)),
                                    len_l, len_r,
                                    extra_i.reshape((1,9))], axis=1)#, layer2.output, layer1.output_cosine], axis=1)    
        return output_i
    
    layer3_input, _ = theano.scan(fn=loop,
                            sequences=[left_l, right_l, layer0_l_input, left_r, right_r, layer0_r_input, mts, extra, norm_length_l, norm_length_r],
                            outputs_info=None,#[self.h0, None],
                            n_steps=batch_size)       
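    # theano.scan maps loop() over the batch: each step consumes one element of
    # every padding/length vector, one row of mts/extra, and one
    # (emb_size, maxSentLength) slice of each input tensor, and emits one
    # (1, feature_size) feature row.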

    
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
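    # feature_size matches the concatenation inside loop():
    #   vec_l + vec_r                    : 2*nkerns[1]
    #   cosine_addition + eucli_addition : 2
    #   uni_cosine + eucli_1             : 2
    #   mts (14 MT metrics)              : 14
    #   len_l + len_r                    : 2
    #   extra (9 rule features)          : 9
    # e.g. with nkerns=[50, 50]: 2*50 + 2 + 2 + 14 + 2 + 9 = 129.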
    feature_size=2*nkerns[1]+2+2+14+2+9
    layer3_input=layer3_input.reshape((batch_size, feature_size))
    layer3=LogisticRegression(rng, input=layer3_input, n_in=feature_size, n_out=3)

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index: index + batch_size],
            right_l: testRightPad_l[index: index + batch_size],
            left_r: testLeftPad_r[index: index + batch_size],
            right_r: testRightPad_r[index: index + batch_size],
            length_l: testLengths_l[index: index + batch_size],
            length_r: testLengths_r[index: index + batch_size],
            norm_length_l: normalized_test_length_l[index: index + batch_size],
            norm_length_r: normalized_test_length_r[index: index + batch_size],
            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#       
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
# 
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
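    # Parameterization note: with decay written as (1.-b1) and (1.-b2), the
    # defaults b1=0.1, b2=0.001 correspond to beta1=0.9, beta2=0.999 in the
    # usual Adam notation, and lr_t applies the standard bias correction
    # lr * sqrt(1 - beta2^t) / (1 - beta1^t) (Kingma & Ba, 2014).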
    
    updates=Adam(cost=cost, params=params, lr=learning_rate)
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)

    train_model_predict = theano.function([index, cost_tmp], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    acc_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across all epochs
#             if (batch_start+1)%1000==0:
#                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y)
                    test_features.append(layer3_input)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+
 
                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_features=numpy.concatenate(test_features, axis=0)
                test_y=numpy.concatenate(test_y, axis=0)
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                acc_nn=1-test_score
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 
                #this pass over the full training set is expensive: it roughly doubles the time each validation takes
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                    train_y.append(y)
                    train_features.append(layer3_input)
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1
 
                train_features=numpy.concatenate(train_features, axis=0)
                train_y=numpy.concatenate(train_y, axis=0)
 
                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_count_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if results_lr[i]==test_y[i]:
                        corr_count_lr+=1
                acc_svm=corr_count*1.0/test_size
                acc_lr=corr_count_lr*1.0/test_size
                if acc_svm > acc_max:
                    acc_max=acc_svm
                    best_epoch=epoch
                if acc_lr > acc_max:
                    acc_max=acc_lr
                    best_epoch=epoch
                if acc_nn > acc_max:
                    acc_max=acc_nn
                    best_epoch=epoch
                print 'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, 'max acc:', acc_max, 'at epoch:', best_epoch

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best test accuracy %f %% obtained at epoch %i' %
          (acc_max * 100., best_epoch))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Attention-based GRU model for WikiQA answer selection (binary relevance),
# evaluated with MAP and MRR.
def evaluate_lenet5_wikiQA(learning_rate=0.0001, n_epochs=2000, nkerns=[256,256], batch_size=1, window_width=[4,4],
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, Div_reg=0.06, update_freq=1, norm_threshold=5.0, max_truncate=40):
    maxSentLength=max_truncate+2*(window_width[0]-1)  # overrides the argument above
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size covers train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # matrix of integer word indices
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # (embedding dim, padded sentence length)
    filter_size=(emb_size,window_width[0])
    filter_size_2=(nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    length_after_wideConv=ishape[1]+filter_size[1]-1  # wide convolution output length: n + w - 1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up embeddings and arrange each sentence as an
    # (emb_size, maxSentLength) matrix (batch_size is 1 here).
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((maxSentLength, emb_size)).transpose()
    
    
    l_input_tensor=debug_print(Matrix_Bit_Shift(layer0_l_input[:,left_l:-right_l]), 'l_input_tensor')
    r_input_tensor=debug_print(Matrix_Bit_Shift(layer0_r_input[:,left_r:-right_r]), 'r_input_tensor')
    
    addition_l=T.sum(layer0_l_input[:,left_l:-right_l], axis=1)
    addition_r=T.sum(layer0_r_input[:,left_r:-right_r], axis=1)
    cosine_addition=cosine(addition_l, addition_r)
    eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
    
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b] 

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    
    cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
    
    #ibm attentive pooling at extended sentence level
    attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
#     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
#     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
#     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
#     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()    
#     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
#     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%      
    
    
    
    #ibm attentive pooling at original sentence level
    simi_matrix_sent=compute_simi_feature_matrix_with_matrix(layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates, length_l, length_r, maxSentLength)
    attention_vec_l=T.nnet.softmax(T.max(simi_matrix_sent, axis=1)).transpose()
    ibm_l=layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r=T.nnet.softmax(T.max(simi_matrix_sent, axis=0)).transpose()
    ibm_r=layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()    
    cosine_ibm=cosine(ibm_l, ibm_r)
    eucli_ibm=1.0/(1.0+EUCLID(ibm_l, ibm_r))#25.2%    
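    # Attentive pooling: softmax over the row-wise (resp. column-wise) maxima of
    # the similarity matrix yields attention weights, and ibm_l / ibm_r are the
    # attention-weighted sums of each sentence's GRU hidden states.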
    
    l_max_attention=T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]#keep only the 3 positions with the highest max-attention
    ll = T.sort(kNeighborsArg).flatten() # restore ascending column order


    r_max_attention=T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[-3:]#keep only the 3 positions with the highest max-attention
    rr = T.sort(kNeighborsArg_r).flatten() # restore ascending column order

    
    l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
    r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
    
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para=[U1, W1, b1] 

    layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)

    vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
    vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')

    
    
#     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
#     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
#     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
#     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
#     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
#     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
#     
    uni_cosine=cosine(vec_l, vec_r)
#     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
#     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
#     '''
#     linear=Linear(sum_uni_l, sum_uni_r)
#     poly=Poly(sum_uni_l, sum_uni_r)
#     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
#     rbf=RBF(sum_uni_l, sum_uni_r)
#     gesd=GESD(sum_uni_l, sum_uni_r)
#     '''
    eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
#     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
#     
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
#     
#     '''
#     len_l=length_l.reshape((1,1))
#     len_r=length_r.reshape((1,1))  
#     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([vec_l, vec_r,
                                uni_cosine,eucli_1,
                                cosine_addition, eucli_addition,
#                                 cosine_sent, eucli_sent,

                                ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])), #2*nkerns[0]+
                                cosine_ibm, eucli_ibm,
                                len_l, len_r,wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
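    # n_in of the classifier below matches this concatenation:
    #   vec_l + vec_r                    : 2*nkerns[1]
    #   uni_cosine + eucli_1             : 2
    #   cosine_addition + eucli_addition : 2
    #   ibm_l + ibm_r                    : 2*nkerns[0]
    #   cosine_ibm + eucli_ibm           : 2
    #   len_l + len_r                    : 2
    #   wmf (word-matching scores)       : 2 (to match the n_in arithmetic)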
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(2*nkerns[1]+2)+2 +(2*nkerns[0]+2)+2+2, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    # Diversity penalty on the classifier weights and on each of the three gate
    # slices of both GRU layers' input (U) and recurrent (W) matrices.
    diversify_reg= (Diversify_Reg(layer3.W.T)
                    +Diversify_Reg(U[0])+Diversify_Reg(W[0])+Diversify_Reg(U1[0])+Diversify_Reg(W1[0])
                    +Diversify_Reg(U[1])+Diversify_Reg(W[1])+Diversify_Reg(U1[1])+Diversify_Reg(W1[1])
                    +Diversify_Reg(U[2])+Diversify_Reg(W[2])+Diversify_Reg(U1[2])+Diversify_Reg(W1[2]))
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg+Div_reg*diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#         
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
#   
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
      
    updates=Adam(cost=cost, params=params, lr=learning_rate)  
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
#             print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    prob_i, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs)
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP %f, MRR %f') %
                          (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm=clf.decision_function(test_features)
                MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm)
                
                lr=LinearRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
    end_time = time.time()
    print('Optimization complete.')
    # Note: best_validation_loss and best_iter come from the tutorial skeleton
    # and are never updated in this function; the per-validation MAP/MRR prints
    # above carry the real results.
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Character- and word-level CNN for SimpleQuestions over Freebase: scores
# candidate (entity, relation, description) triples against the question
# with a max-margin ranking loss.
def evaluate_lenet5_simpleQA(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1, window_width=[2, 5],
                    emb_size=50, char_emb_size=4, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21, 
                    neg_all=100, train_size=200, test_size=200, mark='_forfun'):  #train_size=75909, test_size=17386
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_train(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len

    
    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data=datasets
#     valid_data=datasets[1]
    test_data=datasets_test
#     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
#     
    train_pos_entity_char=train_data[0]
    train_pos_entity_des=train_data[1]
    train_relations=train_data[2]
    train_entity_char_lengths=train_data[3]
    train_entity_des_lengths=train_data[4]
    train_relation_lengths=train_data[5]
    train_mention_char_ids=train_data[6]
    train_remainQ_word_ids=train_data[7]
    train_mention_char_lens=train_data[8]
    train_remainQ_word_len=train_data[9]
    train_entity_scores=train_data[10]

    test_pos_entity_char=test_data[0]
    test_pos_entity_des=test_data[1]
    test_relations=test_data[2]
    test_entity_char_lengths=test_data[3]
    test_entity_des_lengths=test_data[4]
    test_relation_lengths=test_data[5]
    test_mention_char_ids=test_data[6]
    test_remainQ_word_ids=test_data[7]
    test_mention_char_lens=test_data[8]
    test_remainQ_word_len=test_data[9]
    test_entity_scores=test_data[10]
# 
#     test_pos_entity_char=test_data[0]       #matrix, one row per example, head and tail entities interleaved: 40*2*51
#     test_pos_entity_des=test_data[1]        #matrix, one row per example: 20*2*51
#     test_relations=test_data[2]             #matrix, one row per example: 5*51
#     test_entity_char_lengths=test_data[3]   #matrix, one row per example: 3*2*51  (three values per entity)
#     test_entity_des_lengths=test_data[4]    #matrix, one row per example: 3*2*51  (three values per entity)
#     test_relation_lengths=test_data[5]      #matrix, one row per example: 3*51
#     test_mention_char_ids=test_data[6]      #matrix, one row per mention: 40
#     test_remainQ_word_ids=test_data[7]      #matrix, one row per question: 30
#     test_mention_char_lens=test_data[8]     #matrix, three values per mention: 3
#     test_remainQ_word_len=test_data[9]      #matrix, three values per remaining question: 3
    

    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    if any(size!=train_size for size in train_sizes):
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if any(size!=test_size for size in test_sizes):
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    
    indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)   
    indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) 
    
#     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
#     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
#     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
#     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
#     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
#     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
#     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
#     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
#     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
#     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)   
#     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
    index = T.lscalar()
    chosed_indices=T.lvector()
    
    ent_char_ids_M = T.lmatrix()   
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()  
    men_lens_M=T.lmatrix()
    rel_word_ids_M=T.lmatrix()
    rel_word_lens_M=T.lmatrix()
    desH_word_ids_M=T.lmatrix()
    desH_word_lens_M=T.lmatrix()
#     desT_word_ids_M=T.lmatrix()
#     desT_word_lens_M=T.lmatrix()
    q_word_ids_M=T.lmatrix()
    q_word_lens_M=T.lmatrix()
    ent_scores=T.dvector()

#max_char_len, max_des_len, max_relation_len, max_Q_len
#     ent_men_ishape = (char_emb_size, max_char_len)
#     rel_ishape=(emb_size, max_relation_len)
#     des_ishape=(emb_size, max_des_len)
#     q_ishape=(emb_size, max_Q_len)
    
    filter_size=(emb_size,window_width[0])
    char_filter_size=(char_emb_size, window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b]
    char_conv_W_into_matrix=char_conv_W.reshape((char_conv_W.shape[0], char_conv_W.shape[2]*char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix=q_desH_conv_W.reshape((q_desH_conv_W.shape[0], q_desH_conv_W.shape[2]*q_desH_conv_W.shape[3]))
#     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,desH_word_ids_f,
                       desH_word_lens_f,
                       men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f):
        

#         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
#         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
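        # Each row of ids is embedded and arranged as a 4D tensor of shape
        # (batch_size, 1, emb_dim, seq_len), the image_shape layout the
        # convolution layers below expect.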
    
    
        #ent_mention
        ent_char_conv = Conv_with_input_para(rng, input=ent_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng, input=men_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng, input=desH_word_input,
                image_shape=(batch_size, 1, emb_size, max_des_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
#         #q_desT
#         q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_Q_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
#         desT_conv = Conv_with_input_para(rng, input=desT_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_des_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
    #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
    #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')
        
        
        
        ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2])
        
#         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, 
                                                left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, 
                                                dim=max_Q_len+filter_size[1]-1, topk=2)
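        # Note on the pooling asymmetry above: the entity/mention and
        # description sides use plain max-pooling over their spans, while the
        # question side is pooled with topk=2. Average_Pooling_for_SimpleQA is
        # assumed here to keep the k question positions most similar to the
        # pooled relation vector (input_r) and average them, so the question
        # representation is guided by the candidate relation.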
        
        
        q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        
#         q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
#         desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])    
        
        
        overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\
                    0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0

#                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
                sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,desH_word_ids_M,
                   desH_word_lens_M,
                   men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M])
    
    simi_list+=0.5*ent_scores
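    # theano.scan maps SimpleQ_matches_Triple over the leading axis of the
    # input matrices, yielding one CNN similarity per candidate triple;
    # adding 0.5*ent_scores mixes in the upstream entity-linker score.
    # By construction the candidate at row 0 is the positive one.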
    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.mean(loss_simi_list)
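    # Margin ranking loss, elementwise over the negatives:
    #     loss_j = max(0, margin - sim(pos) + sim(neg_j))
    # e.g. with margin=0.5, posi_simi=0.8, nega_simies=[0.6, 0.1]:
    #     [max(0, 0.5-0.8+0.6), max(0, 0.5-0.8+0.1)] = [0.3, 0.0] -> mean 0.15
    # so only negatives scored within `margin` of the positive contribute.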

    

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((char_embeddings** 2).sum()+(embeddings** 2).sum()+(char_conv_W** 2).sum()+(q_rel_conv_W** 2).sum()+(q_desH_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(char_conv_W_into_matrix)+Diversify_Reg(q_rel_conv_W_into_matrix)+Diversify_Reg(q_desH_conv_W_into_matrix)
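    # Diversify_Reg is assumed to penalize redundancy among the filters of a
    # conv layer (the rows of the reshaped W matrices above); a common
    # formulation is the sum of squared off-diagonal entries of the
    # row-normalized Gram matrix W.dot(W.T), which is minimized when the
    # filters are mutually orthogonal.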
    cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    



    test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M,
                                  q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore')
#           givens={
#             ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)),  
#             ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)),
#             men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)),  
#             men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)),
#             rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)),  
#             rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)),
#             desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)), 
#             desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)),
# #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
# #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
#             q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)), 
#             q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)),
#             ent_scores : test_entity_scores[index]},
                                  
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    #+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
#         updates.append((acc_i, acc))    
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size)))))   #Ada
        elif param_i == char_embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(char_emb_size)))))   #AdaGrad
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc)) 
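    # AdaGrad recap: acc accumulates the squared gradient per dimension,
    #     acc   <- acc + g^2
    #     param <- param - learning_rate * g / sqrt(acc + 1e-10)
    # so frequently-updated dimensions take smaller steps. For the two
    # embedding tables, T.set_subtensor additionally pins row 0 (the padding
    # index) back to all zeros after every update, so the pad token never
    # accumulates a learned embedding.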
  
    train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates,
          givens={
            ent_char_ids_M : indices_train_pos_entity_char[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            ent_lens_M : indices_train_entity_char_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            men_char_ids_M : indices_train_mention_char_ids[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            men_lens_M : indices_train_mention_char_lens[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)),  
            rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            desH_word_ids_M : indices_train_pos_entity_des[index].reshape((neg_all, max_des_len))[chosed_indices].reshape((train_neg_size, max_des_len)), 
            desH_word_lens_M : indices_train_entity_des_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
#             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
#             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
            q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), 
            q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            ent_scores : indices_train_entity_scores[index][chosed_indices]
            
            }, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    best_test_accu=0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0


        for batch_start in train_batch_start: 
            # iter counts how many batches have been run so far, across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
 
            minibatch_index=minibatch_index+1
            #print batch_start
            sample_indices=[0]+random.sample(range(1, neg_all), train_neg_size-1)
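            # index 0 is always the positive triple; the other
            # train_neg_size-1 indices are negatives sampled from the
            # remaining neg_all-1 candidates, e.g. neg_all=100,
            # train_neg_size=10 -> [0] plus 9 indices drawn from 1..99.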
            loss_simi_i, cost_i= train_model(batch_start, sample_indices)
#             if batch_start%1==0:
#                 print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
#                 store_model_to_file(rootPath, params)
 
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#             
            if iter % n_train_batches == 0:
                 
                test_loss=[]
                succ=0
                for i in range(test_size):
#                     print 'testing', i, '...'
                    #prepare data
                    test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))  
                    test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))
                    test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int64').reshape((length_per_example_test[i], max_relation_len))  
                    test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int64').reshape((length_per_example_test[i], max_des_len))
                    test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int64').reshape((length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX)
             
             
             
             
                                
                    loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M,
                                                       test_desH_word_ids_M, test_desH_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
#                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0]>=max(simi_list_i[1:]):
                        succ+=1
#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ=succ*1.0/test_size
                #now, report accuracy: the fraction of questions where the positive candidate ranks first
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                           'model %f') %
                          (epoch, minibatch_index, n_train_batches,succ))

                if best_test_accu< succ:
                    best_test_accu=succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock() 

            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Example 4
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=0.0000001, extra_size=4, emb_size=300, batch_size=50, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300]):

    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, so the model generates the same results across runs


    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id  =load_SNLI_dataset(maxlen=maxSentLen)  #minlen: each example includes one label and at least one word per sentence
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)

    train_sents_l=np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r=np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_sents_r=np.asarray(test_sents_r, dtype='int32')

    train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)


    train_labels_store=np.asarray(all_labels[0], dtype='int32')
    dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    test_labels_store=np.asarray(test_labels, dtype='int32')

    train_size=len(train_labels_store)
    dev_size=len(dev_labels_store)
    test_size=len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size=len(word2id)+1


    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    #here we load word2vec to initialize the word embeddings
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
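    # Shape walk-through for embed_input: sent_ids is (batch_size, maxSentLen);
    # flatten -> (batch_size*maxSentLen,); row lookup in emb_matrix ->
    # (batch_size*maxSentLen, emb_size); reshape -> (batch_size, maxSentLen,
    # emb_size); dimshuffle(0,2,1) -> (batch_size, emb_size, maxSentLen),
    # the (batch, feature, length) layout the conv layers below expect.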

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)



    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
             image_shape_r = (batch_size, 1, hidden_size[0], maxSentLen),
             filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,hidden_size[0], 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    "form input to LR classifier"
    LR_input = T.concatenate([attentive_sent_embeddings_l,attentive_sent_embeddings_r],axis=1)
    LR_input_size=2*hidden_size[1]

    U_a = create_ensemble_para(rng, 3, LR_input_size) # weight matrix of shape (LR_input_size, 3)
    LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)  #for classification tasks we usually use negative log likelihood as the loss; the lower the better.



    '''
    testing
    '''

    test_preds = T.argmax(layer_LR.p_y_given_x, axis=1)
    transfered_preds = T.eq(test_preds, 2)
    test_error = T.mean(T.neq(transfered_preds, labels))
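    # Transfer evaluation: the classifier is trained with 3-way SNLI labels,
    # but the ACE05 test labels here are binary, so T.eq(test_preds, 2) maps
    # class index 2 (assumed to be the entailment-like class) to 1 and the
    # other two classes to 0 before comparing against the binary labels.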




    params = [init_embeddings]+NN_para+LR_para

    cost=loss

    updates =   Gradient_Cost_Para(cost,params, learning_rate)


    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], [test_error,transfered_preds], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    n_dev_batches=dev_size/batch_size
    dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
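    # Appending [size-batch_size] adds one overlapping tail batch so every
    # example is covered with a fixed batch_size, e.g. test_size=103,
    # batch_size=50 -> starts [0, 50, 53]: the last batch re-scores examples
    # 53..99 but also covers the 3 leftovers 100..102.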


    max_acc_dev=0.0
    max_acc_test=0.0
    max_f1=0.0

    cost_i=0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(train_indices) #shuffle the training set each epoch; this usually helps performance, though it is not guaranteed

        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been run so far, across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]
            cost_i+= train_model(
                                train_sents_l[train_id_batch],
                                train_masks_l[train_id_batch],
                                train_sents_r[train_id_batch],
                                train_masks_r[train_id_batch],
                                train_labels_store[train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()

                pred_labels =[]
                gold_labels =[]
                error_sum=0.0
                for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
                    error_i, pred_labels_i=test_model(
                            test_sents_l[test_batch_id:test_batch_id+batch_size],
                            test_masks_l[test_batch_id:test_batch_id+batch_size],
                            test_sents_r[test_batch_id:test_batch_id+batch_size],
                            test_masks_r[test_batch_id:test_batch_id+batch_size],
                            test_labels_store[test_batch_id:test_batch_id+batch_size])

                    error_sum+=error_i
                    pred_labels+=list(pred_labels_i)
                    gold_labels+= list(test_labels_store[test_batch_id:test_batch_id+batch_size])
                test_acc=1.0-error_sum/(len(test_batch_start))
                test_f1= f1_score(gold_labels, pred_labels, average='binary')

                if test_acc > max_acc_test:
                    max_acc_test=test_acc
                if test_f1 > max_f1:
                    max_f1 = test_f1

                    # store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params)


                print '\t\tcurrent acc:', test_acc,' ; ','\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1



        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
# Example 5
        word2id, config['type_size'], config['describ_max_len'])
    emb_root = '/scratch/wyin3/dickens_save_dataset/LORELEI/multi-lingual-emb/'
    print('loading bilingual embeddings....')
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)

    vocab_size = len(word2id) + 1
    rand_values = np.random.RandomState(1234).normal(
        0.0, 0.01,
        (vocab_size,
         config['emb_size']))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(config['emb_size']), dtype=np.float32)
    id2word = {y: x for x, y in word2id.items()}
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = torch.Tensor(rand_values)

    print("build model...")
    model, loss_function, optimizer = build_model(config['emb_size'],
                                                  config['hidden_size'],
                                                  vocab_size, 12, embeddings,
                                                  config['lr'],
                                                  config['batch_size'])
    print("training...")
    # train_start = time.time()
    train(all_sentences, all_masks, all_labels, label_sent, label_mask,
          config['epoch_num'], model, loss_function, optimizer)
    '''
    1, whether the embeddings are trained
    2, the loss function differs from theano's
# Example 6
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[40, 40]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, so the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_il6_with_BBN(
        maxlen=maxSentLen
    )  #minlen: each example includes one label and at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_word2vec_given_file([
        emb_root + 'IL6-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL6-cca-wiki-lorelei-d40.IL6.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    bow_mean_des = bow_des / T.sum(des_mask, axis=1).dimshuffle(0, 'x')

    # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    # conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    # NN_para = [conv_W, conv_b, conv_W2, conv_b2]
    #
    # conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
    #          mask_matrix = sents_mask,
    #          image_shape=(batch_size, 1, emb_size, maxSentLen),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    #
    # conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
    #          mask_matrix = sents_mask,
    #          image_shape=(batch_size, 1, emb_size, maxSentLen),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2)    #mutiple mask with the conv_out to set the features by UNK to zero
    # sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    #
    # LR_input = T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1)
    # LR_input_size = hidden_size[0]*2+emb_size
    # #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    # U_a = create_ensemble_para(rng, 12, LR_input_size) # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((12,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    # LR_para=[U_a, LR_b]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    # conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b]
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features at UNK/pad positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=des_input,
        mask_matrix=des_mask,
        image_shape=(type_size, 1, emb_size, describ_max_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out the features at UNK/pad positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(type_size, hidden_size) # each type description then has an embedding of length hidden_size

    # repeat_text_tensor3 = T.repeat(common_input, type_size, axis=0)
    # repeat_des_tensor3 = T.repeat(des_input, batch_size, axis=0)
    #
    # repeat_text_mask = T.repeat(sents_mask, type_size, axis=0)
    # repeat_des_mask = T.repeat(des_mask, batch_size, axis=0)
    #
    #
    # attentive_conv_layer = Attentive_Conv_for_Pair(rng,
    #         origin_input_tensor3=repeat_text_tensor3,
    #         origin_input_tensor3_r = repeat_des_tensor3,
    #         input_tensor3=repeat_text_tensor3,
    #         input_tensor3_r = repeat_des_tensor3,
    #          mask_matrix = repeat_text_mask,
    #          mask_matrix_r = repeat_des_mask,
    #          image_shape=(batch_size*type_size, 1, emb_size, maxSentLen),
    #          image_shape_r = (batch_size*type_size, 1, emb_size, describ_max_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
    #          filter_shape_context=(hidden_size[0], 1,emb_size, 1),
    #          W=conv_att_W, b=conv_att_b,
    #          W_context=conv_W_context, b_context=conv_b_context)
    # sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l
    # des_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_r
    repeat_sent_emb = T.repeat(sent_embeddings, type_size, axis=0)
    repeat_des_emb = T.repeat(sent_embeddings2.dimshuffle('x', 0, 1),
                              batch_size,
                              axis=0).reshape(
                                  (batch_size * type_size, hidden_size[0]))
    repeat_des_bow = T.repeat(bow_mean_des.dimshuffle('x', 0, 1),
                              batch_size,
                              axis=0).reshape(
                                  (batch_size * type_size, emb_size))

    score_input = T.concatenate([
        repeat_sent_emb, repeat_des_emb, repeat_des_bow,
        repeat_sent_emb * repeat_des_bow
    ],
                                axis=1)
    U_a = create_ensemble_para(rng, 1, 2 * hidden_size[0] +
                               2 * emb_size)  # scoring vector over the concatenated features
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]

    att_score_list = T.nnet.sigmoid(score_input.dot(U_a))
    att_score_matrix = att_score_list.reshape((batch_size, type_size))
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)
    att_loss = -T.mean(T.log(att_prob_pos))
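    # Multi-label sigmoid loss: labels is a (batch, type_size) 0/1 matrix;
    # T.where(labels < 1, 1-p, p) selects p for gold-positive types and 1-p
    # for negatives, so -T.mean(T.log(...)) is elementwise binary
    # cross-entropy over all (example, type) pairs.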

    # score_matrix = T.nnet.sigmoid(normalize_matrix_rowwise(bow_emb).dot(normalize_matrix_rowwise(bow_des).T)) #(batch_size, type_size)
    # prob_pos = T.where( labels < 1, 1.0-score_matrix, score_matrix)
    # loss = -T.mean(T.log(prob_pos))

    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = NN_para + LR_para  # put all model parameters together
    cost = att_loss  #+1e-4*((U_a**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''
    binarize_prob = T.where(att_score_matrix > 0.1, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, att_score_matrix],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], label_sent,
                                  label_mask)

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels, prob_matrix_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    # print 'prob_matrix_i:'
                    # print prob_matrix_i
                    # exit(0)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_meanf1_test, max_weightf1_test  # this function tracks mean/weighted F1, not accuracy
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300, batch_size=50, describ_max_len=20, type_size=12,filter_size=[3,5], maxSentLen=100, hidden_size=[300,300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/'
    # test_file_path = '/save/wenpeng/datasets/LORELEI/il9-eng/il9-eng-setE-as-test-input_ner_filtered_w2.txt'
    # output_file_path = '/save/wenpeng/datasets/LORELEI/il9-eng/il9-eng_system_output_epoch4.json'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il10-eng/il10-eng-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il10-eng/il10-eng_system_output_epoch4.json'
    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, so the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id={}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels,word2id = load_trainingData_types(word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels,word2id = load_trainingData_types_plus_others(word2id, maxSentLen)
    test_sents, test_masks, test_lines,word2id = load_official_testData(word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len)
    label_sent=np.asarray(label_sent, dtype='int32')
    label_mask=np.asarray(label_mask, dtype=theano.config.floatX)


    train_p1_sents=np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks=np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels=np.asarray(train_p1_labels, dtype='int32')
    train_p1_size=len(train_p1_labels)

    train_p2_sents=np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks=np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels=np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size=len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents=np.concatenate([train_p1_sents,train_p2_sents],axis=0)
    train_masks=np.concatenate([train_p1_masks,train_p2_masks],axis=0)
    train_labels=np.concatenate([train_p1_labels,train_p2_labels],axis=0)
    train_size=train_p1_size+train_p2_size

    test_sents=np.asarray(test_sents, dtype='int32')
    test_masks=np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size=len(test_sents)

    vocab_size=  len(word2id)+1 # add one zero pad index

    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    #now, start to build the input form of the model
    sents_id_matrix=T.imatrix('sents_id_matrix')
    sents_mask=T.fmatrix('sents_mask')
    labels=T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input=embeddings[sents_id_matrix.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input*sents_mask.dimshuffle(0,'x',1),axis=2)
    repeat_common_input = T.repeat(normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen)


    des_input=embeddings[des_id_matrix.flatten()].reshape((type_size,describ_max_len, emb_size)).dimshuffle(0,2,1)
    bow_des = T.sum(des_input*des_mask.dimshuffle(0,'x',1),axis=2) #(type_size, emb_size)
    repeat_des_input = T.tile(normalize_tensor3_colwise(des_input), (batch_size,1,1))#(batch_size*type_size, emb_size, maxsentlen)


    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    conv_W_context2, conv_b_context2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [conv_att_W, conv_att_b,conv_W_context,conv_att_W2, conv_att_b2,conv_W_context2]

    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
             mask_matrix = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #multiply the mask with conv_out to zero out the features at UNK/pad positions
    sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
             mask_matrix = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2)    #multiply the mask with conv_out to zero out the features at UNK/pad positions
    sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size


    '''
    GRU
    '''
    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para=[U1, W1, b1]     #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer=GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,  hidden_size[0], U1, W1, b1)
    gru_sent_embeddings=gru_layer.output_sent_rep  # (batch_size, hidden_size)



    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(rng,
            origin_input_tensor3=common_input,
            origin_input_tensor3_r = common_input,
            input_tensor3=common_input,
            input_tensor3_r = common_input,
             mask_matrix = sents_mask,
             mask_matrix_r = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[0], 1,emb_size, 1),
             W=conv_att_W, b=conv_att_b,
             W_context=conv_W_context, b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(rng,
            origin_input_tensor3=common_input,
            origin_input_tensor3_r = common_input,
            input_tensor3=common_input,
            input_tensor3_r = common_input,
             mask_matrix = sents_mask,
             mask_matrix_r = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
             filter_shape_context=(hidden_size[0], 1,emb_size, 1),
             W=conv_att_W2, b=conv_att_b2,
             W_context=conv_W_context2, b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l

    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1=HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh)
    des_rep_hidden = HL_layer_1.output #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(des_rep_hidden.T)) #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))


    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(cosine_scores) #(batch_size, type_size)

    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(repeat_common_input.dimshuffle(0,2,1),repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape((batch_size*type_size,maxSentLen*describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,-30:] # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape((batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)
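    # Fine-grained dataless score: both tensors were column-normalized above,
    # so batched_dot yields word-to-description-word cosine similarities;
    # flattening to (batch_size*type_size, maxSentLen*describ_max_len),
    # sorting each row, and averaging the 30 largest entries gives one score
    # per (sentence, type) pair, squashed into (0,1) by the sigmoid.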





    acnn_LR_input = T.concatenate([dot_dnn_dataless_1, dot_dnn_dataless_2,cosine_score_matrix,top_k_score_matrix,sent_embeddings,sent_embeddings2, gru_sent_embeddings,sent_att_embeddings,sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0]*5+emb_size+4*type_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a, acnn_LR_b = create_LR_para(rng,acnn_LR_input_size, 12)
    acnn_LR_para=[acnn_U_a, acnn_LR_b]
    acnn_layer_LR=LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b) #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where( labels < 1, 1.0-acnn_score_matrix, acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng,acnn_LR_input_size, 16)
    acnn_other_LR_para=[acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR=LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(acnn_other_layer_LR.before_softmax.reshape((batch_size*4,4)) )
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape((batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[T.repeat(T.arange(batch_size), 4), T.tile(T.arange(4), (batch_size)), other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))
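    # The advanced indexing above picks, for example b and "other" field f,
    # the predicted probability of the gold class other_labels[b, f]: with
    # batch_size=2 the row index is [0,0,0,0,1,1,1,1], the field index
    # [0,1,2,3,0,1,2,3], and the class index other_labels.flatten(); the loss
    # is the mean negative log-likelihood over all (example, field) pairs.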




    params = multiCNN_para  + GRU_NN_para   +ACNN_para +acnn_LR_para + HL_layer_1_params# put all model parameters together
    cost=acnn_loss+   1e-4*((conv_W**2).sum()+(conv_W2**2).sum()+(conv_att_W**2).sum()+(conv_att_W2**2).sum())
    updates =   Gradient_Cost_Para(cost,params, learning_rate)


    other_paras = params+acnn_other_LR_para
    cost_other = cost  + acnn_other_field_loss
    other_updates =   Gradient_Cost_Para(cost_other,other_paras, learning_rate)

    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix#T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores#0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function([sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    train_p2_model = theano.function([sents_id_matrix, sents_mask, labels,des_id_matrix, des_mask,other_labels], cost_other, updates=other_updates,allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask, des_id_matrix, des_mask], [binarize_prob,ensemble_scores,sum_tensor3], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    n_train_p2_batches=train_p2_size/batch_size
    train_p2_batch_start=list(np.arange(n_train_p2_batches)*batch_size)+[train_p2_size-batch_size]
    n_test_batches=test_size/batch_size
    n_test_remain=test_size%batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i=0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been run so far, across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]

            cost_i+= train_p1_model(
                                train_sents[train_id_batch],
                                train_masks[train_id_batch],
                                train_labels[train_id_batch],
                                label_sent,
                                label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id+batch_size]
                other_cost_i+=train_p2_model(
                                    train_p2_sents[train_p2_id_batch],
                                    train_p2_masks[train_p2_id_batch],
                                    train_p2_labels[train_p2_id_batch],
                                    label_sent,
                                    label_mask,
                                    train_p2_other_labels[train_p2_id_batch]
                                    )
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after every 20 batches, we test the performance of the model on all test data
            if iter%20==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter),str(other_cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(test_batch_start): # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i=test_model(
                                test_sents[test_batch_id:test_batch_id+batch_size],
                                test_masks[test_batch_id:test_batch_id+batch_size],
                                label_sent,
                                label_mask
                                )
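                    # the appended final batch overlaps the previous one, so
                    # keep only its trailing n_test_remain predictions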
                    if i < len(test_batch_start)-1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                # mean_frame = generate_2018_official_output_english(test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame)
                mean_frame = generate_2018_official_output(test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test over, min_mean_frame:', min_mean_frame




        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=1e-5,
                    extra_size=4,
                    emb_size=300,
                    batch_size=100,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    max_term_len=4,
                    p_mode='conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fix the random seed so the model reproduces the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words(
        maxlen=maxSentLen, wordlen=max_term_len
    )  #minlen: each example includes one label and at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id, group_size_list = load_EVAlution_hyper_vs_all_with_allDefComb(
        maxSentLen, word2id, wordlen=max_term_len)
    # store_word2id(word2id, '/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_best_para_word2id.pkl')
    # exit(0)
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size
    test_size = len(test_sents_l)

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_word1 = np.asarray(all_word1[:train_size], dtype='int32')
    train_word2 = np.asarray(all_word2[:train_size], dtype='int32')
    test_word1 = np.asarray(test_word1, dtype='int32')
    test_word2 = np.asarray(test_word2, dtype='int32')

    train_word1_mask = np.asarray(all_word1_mask[:train_size],
                                  dtype=theano.config.floatX)
    train_word2_mask = np.asarray(all_word2_mask[:train_size],
                                  dtype=theano.config.floatX)
    test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_extra = np.asarray(all_extra[:train_size],
                             dtype=theano.config.floatX)
    test_extra = np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)

    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the Python variable "rand_values" into a Theano shared variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
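    # embed_input: look up token embeddings, reshape to (batch, maxSentLen,
    # emb_size), then transpose to (batch, emb_size, maxSentLen), the layout
    # the convolution layers below expect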

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #this input format can feed a CNN, GRU, or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape(
        (batch_size, word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape(
        (batch_size, word2_ids.shape[1], emb_size))
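    # mask-weighted sums over the term tokens give one fixed-size vector per
    # term (padding positions contribute zero)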
    word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    '''create_AttentiveConv_params'''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=word1_mask,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

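    # standard pair-feature vector: both representations plus their
    # elementwise product and difference (likewise for p_wd/p_dw/p_dd below)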
    p_ww = T.concatenate([
        tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r,
        tt_embeddings_l - tt_embeddings_r
    ],
                         axis=1)

    term_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_input_r,
        mask_matrix=word1_mask,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([
        td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r,
        td_embeddings_l - td_embeddings_r
    ],
                         axis=1)

    def_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=sents_mask_l,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([
        dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r,
        dt_embeddings_l - dt_embeddings_r
    ],
                         axis=1)

    def_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([
        dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r,
        dd_embeddings_l - dd_embeddings_r
    ],
                         axis=1)

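    # combine the four views (term-term, term-def, def-term, def-def): either
    # concatenate them (p_len = 4*4*hidden) or take an elementwise max over
    # the four pair vectors (p_len = 4*hidden)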
    if p_mode == 'conc':
        p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4 * 4 * hidden_size[1]
    else:
        p = T.max(T.concatenate([
            p_ww.dimshuffle('x', 0, 1),
            p_wd.dimshuffle('x', 0, 1),
            p_dw.dimshuffle('x', 0, 1),
            p_dd.dimshuffle('x', 0, 1)
        ],
                                axis=0),
                  axis=0)
        p_len = 4 * hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)

    "form input to LR classifier"
    LR_input = T.concatenate([
        p,
        cosine_matrix1_matrix2_rowwise(word1_embedding,
                                       word2_embedding).dimshuffle(0, 'x'),
        extra.dimshuffle(0, 'x')
    ],
                             axis=1)
    LR_input_size = p_len + 1 + 1
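    # p_len pair features + 1 cosine(word1, word2) feature + 1 extra scalar feature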
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # weight matrix mapping LR_input_size features to 2 classes
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=LR_input,
        n_in=LR_input_size,
        n_out=2,
        W=U_a,
        b=LR_b,
        bias=0.25
    )  #essentially a linear transform of the input features by the weight matrix, followed by softmax
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification tasks we usually use negative log-likelihood as the loss; the lower the better

    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    params = NN_para + LR_para  #[init_embeddings]

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra
    ], [layer_LR.y_pred, layer_LR.prop_for_posi],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
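    # when test_size is not a multiple of batch_size, append a final batch that
    # ends exactly at test_size; its overlap with the previous batch is removed
    # from probs after prediction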
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set each epoch; usually helps performance, though not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                train_word1[train_id_batch], train_word2[train_id_batch],
                train_word1_mask[train_id_batch],
                train_word2_mask[train_id_batch], train_extra[train_id_batch],
                train_labels_store[train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_word1[test_batch_id:test_batch_id + batch_size],
                        test_word2[test_batch_id:test_batch_id + batch_size],
                        test_word1_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_word2_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_extra[test_batch_id:test_batch_id + batch_size])

                    # error_sum+=error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)

                # print len(test_sents_l), len(probs)
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                print len(test_sents_l), len(probs)
                assert len(test_sents_l) == len(probs)
                assert sum(group_size_list) == len(probs)
                #each gold pair corresponds to a group of candidate rows; score it by the max probability within its group
                max_probs = []
                prior_size = 0
                for i in range(len(group_size_list)):

                    sub_probs = probs[prior_size:prior_size +
                                      group_size_list[i]]
                    prior_size += group_size_list[i]
                    max_probs.append(max(sub_probs))

                assert len(test_labels) == len(max_probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
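                # rank candidates by group score and, assuming apk computes
                # average precision at k, evaluate over the full ranking and
                # the top 100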
                test_ap = apk(test_labels, max_probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, max_probs, k=100)

                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                    store_model_to_file(
                        '/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'
                        + str(max_ap_test), params)
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fix the random seed so the model reproduces the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen: each example includes one label and at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the Python variable "rand_values" into a Theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #this input format can feed a CNN, GRU, or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
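    # bow_emb: zero out padding via the mask, then sum token embeddings into a
    # bag-of-words sentence vector of size emb_size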
    # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x')

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]

    # NN_para = multiCNN_para+ACNN_para

    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features at padded positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size): each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out to zero out features at padded positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size): each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer: maps the feature vector to scores for the 12 target classes
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # weight matrix mapping LR_input_size features to 12 classes
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #essentially a linear transform of the input features by the weight matrix
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
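    # the 12 classes are not mutually exclusive, so each cell gets an
    # independent sigmoid score; prob_pos is the probability assigned to the
    # gold decision per cell (s for label 1, 1-s for label 0), so the loss is
    # the mean binary cross-entropy over all batch*12 cells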
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0] + emb_size
    #classification layer: maps the feature vector to scores for the 12 target classes
    U_att_a = create_ensemble_para(
        rng, 12, LR_att_input_size)  # weight matrix mapping LR_att_input_size features to 12 classes
    LR_att_b = theano.shared(value=np.zeros((12, ),
                                            dtype=theano.config.floatX),
                             name='LR_att_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #essentially a linear transform of the input features by the weight matrix
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    acnn_LR_input = T.concatenate(
        [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer: maps the feature vector to scores for the 12 target classes
    acnn_U_a = create_ensemble_para(
        rng, 12, acnn_LR_input_size)  # weight matrix mapping acnn_LR_input_size features to 12 classes
    acnn_LR_b = theano.shared(value=np.zeros((12, ),
                                             dtype=theano.config.floatX),
                              name='acnn_LR_b',
                              borrow=True)  #bias for each target class
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #essentially a linear transform of the input features by the weight matrix
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)

    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para  # put all model parameters together
    cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() +
                                                 (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

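    # ensemble the three classifiers (plain CNN, GRU, attentive CNN) with an
    # elementwise max over their sigmoid score matrices, then threshold at 0.3
    # to produce multi-hot label predictions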
    ensemble_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1),
        acnn_score_matrix.dimshuffle('x', 0, 1)
    ],
                                          axis=0),
                            axis=0)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask],
                                 binarize_prob,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    drop_p=0.2,
                    div_weight=0.00001,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    comment=''):

    model_options = locals().copy()
    print "model options", model_options

    first_seeds = [1234, 1235, 1236, 1237]  #random seeds for the first model copy
    first_rngs = [
        np.random.RandomState(first_seeds[0]),
        np.random.RandomState(first_seeds[1]),
        np.random.RandomState(first_seeds[2]),
        np.random.RandomState(first_seeds[3])
    ]  #fixed random seeds so the model reproduces the same results
    first_srng = RandomStreams(first_rngs[0].randint(999999))

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_extra, all_labels, word2id, test_rows = load_SNLI_dataset_with_extra_with_test(
        maxlen=maxSentLen
    )  #minlen: each example includes one label and at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    #     train_sents_l = np.concatenate((train_sents_l, dev_sents_l), axis=0)
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    #     train_masks_l = np.concatenate((train_masks_l, dev_masks_l), axis=0)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    #     train_sents_r = np.concatenate((train_sents_r, dev_sents_r), axis=0)
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    #     train_masks_r = np.concatenate((train_masks_r, dev_masks_r), axis=0)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_extra = np.asarray(all_extra[0], dtype=theano.config.floatX)
    dev_extra = np.asarray(all_extra[1], dtype=theano.config.floatX)
    test_extra = np.asarray(all_extra[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    #     train_labels_store = np.concatenate((train_labels_store, dev_labels_store), axis=0)
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = first_rngs[0].normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    #     word2vec =extend_word2vec_lowercase(word2vec)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    first_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the Python variable "rand_values" into a Theano shared variable

    #now, start to build the input form of the model
    train_flag = T.iscalar()
    first_sents_ids_l = T.imatrix()
    first_sents_mask_l = T.fmatrix()
    first_sents_ids_r = T.imatrix()
    first_sents_mask_r = T.fmatrix()
    first_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def common_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    first_common_input_l = dropout_layer(
        first_srng, common_input(first_embeddings,
                                 first_sents_ids_l), drop_p, train_flag
    )  #this input format can feed a CNN, GRU, or LSTM
    first_common_input_r = dropout_layer(
        first_srng, common_input(first_embeddings,
                                 first_sents_ids_r), drop_p, train_flag
    )

    gate_filter_shape = (hidden_size[0], 1, emb_size, 1)

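    # create_CNN_params builds one parameter set per copy: width-1 "pre" and
    # "gate" convolutions (a gating mechanism over the input), the main
    # convolution of width filter_size[0], and a width-1 context convolution
    # for the attentive component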
    def create_CNN_params(rng):
        conv_W_2_pre, conv_b_2_pre = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2_gate, conv_b_2_gate = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2, conv_b_2 = create_conv_para(rng,
                                              filter_shape=(hidden_size[1], 1,
                                                            hidden_size[0],
                                                            filter_size[0]))
        conv_W_2_context, conv_b_2_context = create_conv_para(
            rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
        return conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate, conv_W_2, conv_b_2, conv_W_2_context, conv_b_2_context

    first_conv_W_pre, first_conv_b_pre, first_conv_W_gate, first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context, first_conv_b_context = create_CNN_params(
        first_rngs[0])

    first_CNN_1_para = [
        first_conv_W_pre, first_conv_b_pre, first_conv_W_gate,
        first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context
    ]
    '''
    first copy
    '''

    def copy(rngs, common_input_l, common_input_r, sents_mask_l, sents_mask_r,
             drop_conv_W_1_pre, conv_b_1_pre, drop_conv_W_1_gate,
             conv_b_1_gate, drop_conv_W_1, conv_b_1, drop_conv_W_1_context,
             conv_b_1_context, labels):

        loss_0_0, distr_0_0, params_0_0 = one_classifier_in_one_copy(
            rngs[0], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre, conv_b_1_pre, drop_conv_W_1_gate, conv_b_1_gate,
            drop_conv_W_1, conv_b_1, drop_conv_W_1_context, conv_b_1_context,
            True)
        # loss_0_4, distr_0_4, params_0_4 = one_classifier_in_one_copy(rng, common_input_l,common_input_r,sents_mask_l,sents_mask_r,batch_size, emb_size,
        #                                                              maxSentLen,gate_filter_shape,hidden_size,filter_size,
        #                        first_srng, drop_p,train_flag,labels,
        #                        drop_conv_W_1_pre_5,conv_b_1_pre_5,drop_conv_W_1_gate_5,conv_b_1_gate_5,
        #                        drop_conv_W_1_5,conv_b_1_5,drop_conv_W_1_context_5,conv_b_1_context_5,
        #                        True)
        # loss_0_5, distr_0_5, params_0_5 = one_classifier_in_one_copy(rng, common_input_l,common_input_r,sents_mask_l,sents_mask_r,batch_size, emb_size,
        #                                                              maxSentLen,gate_filter_shape,hidden_size,filter_size,
        #                        first_srng, drop_p,train_flag,labels,
        #                        drop_conv_W_1_pre_6,conv_b_1_pre_6,drop_conv_W_1_gate_6,conv_b_1_gate_6,
        #                        drop_conv_W_1_6,conv_b_1_6,drop_conv_W_1_context_6,conv_b_1_context_6,
        #                        False)

        #         psp_label = T.repeat(labels, multi_psp_size)

        loss_0 = loss_0_0
        para_0 = params_0_0

        #         loss = loss_0+loss_1+loss_2
        batch_distr = distr_0_0  #T.sum((layer_LR.p_y_given_x).reshape((batch_size, multi_psp_size,3)), axis=1)  #(batch, 3)

        return loss_0, para_0, batch_distr

    first_loss, first_classifier_params, first_test_distr = copy(
        first_rngs, first_common_input_l, first_common_input_r,
        first_sents_mask_l, first_sents_mask_r, first_conv_W_pre,
        first_conv_b_pre, first_conv_W_gate, first_conv_b_gate, first_conv_W,
        first_conv_b, first_conv_W_context, first_conv_b_context, first_labels)
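    # copy() assembles one full classifier over the sentence pair and returns
    # its loss, its classifier parameters, and the per-batch class
    # distribution; the hard predictions and error rate are derived from that
    # distribution below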

    first_preds = T.argmax(first_test_distr, axis=1)  #batch
    all_error = T.mean(T.neq(first_preds, first_labels))

    #     neg_labels = T.where( labels < 2, 2, labels-1)
    #     loss2=-T.mean(T.log(1.0/(1.0+layer_LR.p_y_given_x))[T.arange(neg_labels.shape[0]), neg_labels])

    # rank loss
    # entail_prob_batch = T.nnet.softmax(layer_LR.before_softmax.T)[2] #batch
    # entail_ids = elementwise_is_two(labels)
    # entail_probs = entail_prob_batch[entail_ids.nonzero()]
    # non_entail_probs = entail_prob_batch[(1-entail_ids).nonzero()]
    #
    # repeat_entail = T.extra_ops.repeat(entail_probs, non_entail_probs.shape[0], axis=0)
    # repeat_non_entail = T.extra_ops.repeat(non_entail_probs.dimshuffle('x',0), entail_probs.shape[0], axis=0).flatten()
    # loss2 = -T.mean(T.log(entail_probs))#T.mean(T.maximum(0.0, margin-repeat_entail+repeat_non_entail))

    # zero_matrix = T.zeros((batch_size, 3))
    # filled_zero_matrix = T.set_subtensor(zero_matrix[T.arange(batch_size), labels], 1.0)
    # prob_batch_posi = layer_LR.p_y_given_x[filled_zero_matrix.nonzero()]
    # prob_batch_nega = layer_LR.p_y_given_x[(1-filled_zero_matrix).nonzero()]
    #
    # repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    # repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    # loss2 = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    first_common_para = [first_embeddings]
    first_classifier_1_para = first_CNN_1_para + first_classifier_params

    first_common_updates = Gradient_Cost_Para(first_loss, first_common_para,
                                              learning_rate)
    first_classifier_1_updates = Gradient_Cost_Para(first_loss,
                                                    first_classifier_1_para,
                                                    learning_rate)

    cost = first_loss

    first_updates = first_common_updates + first_classifier_1_updates

    updates = first_updates

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([
        train_flag,
        first_sents_ids_l,
        first_sents_mask_l,
        first_sents_ids_r,
        first_sents_mask_r,
        first_labels,
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    #     train_model_pred = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra,labels,
    #                                         second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], [LR_input, labels], allow_input_downcast=True, on_unused_input='ignore')
    #
    #     dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra, labels,
    #                                  second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        train_flag,
        first_sents_ids_l,
        first_sents_mask_l,
        first_sents_ids_r,
        first_sents_mask_r,
        first_labels,
    ], [all_error, first_preds],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    gold_test_rows = test_rows[:(n_test_batches *
                                 batch_size)] + test_rows[-batch_size:]
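    # gold_test_rows repeats the last batch_size rows to mirror the overlapping
    # final batch in test_batch_start, keeping pred_ys and gold rows aligned
    # when the error-analysis file is written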

    max_acc_test = 0.0

    cost_i = 0.0
    first_train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(200).shuffle(
            first_train_indices
        )  #shuffle the training set each epoch; usually helps performance, though not guaranteed
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            first_train_id_batch = first_train_indices[batch_id:batch_id +
                                                       batch_size]
            cost_i += train_model(1, train_sents_l[first_train_id_batch],
                                  train_masks_l[first_train_id_batch],
                                  train_sents_r[first_train_id_batch],
                                  train_masks_r[first_train_id_batch],
                                  train_labels_store[first_train_id_batch])

            #after every 2000*(50/batch_size) batches, we test the performance of the model on all test data
            if iter % int(2000 * (50.0 / batch_size)) == 0:
                #             if iter%int(200*(50.0 / batch_size))==0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
                #     print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                #     past_time = time.time()

                #                 error_sum=0.0
                #                 for dev_batch_id in dev_batch_start: # for each test batch
                #                     error_i=dev_model(
                #                                 dev_sents_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_sents_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                #                                 )
                #
                #                     error_sum+=error_i
                #                 dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                #                 if dev_accuracy > max_acc_dev:
                #                     max_acc_dev=dev_accuracy
                #                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                #best dev model, do test
                #                 error_sum_1=0.0
                #                 error_sum_2=0.0
                error_sum_comb = 0.0
                pred_ys = []
                for test_batch_id in test_batch_start:  # for each test batch
                    error_comb, pred_ys_batch = test_model(
                        0,
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    #                     error_sum_1+=error_1
                    #                     error_sum_2+=error_2
                    error_sum_comb += error_comb
                    pred_ys += list(pred_ys_batch)

#                 test_acc_1=1.0-error_sum_1/(len(test_batch_start))
#                 test_acc_2=1.0-error_sum_2/(len(test_batch_start))
                test_acc_comb = 1.0 - error_sum_comb / (len(test_batch_start))

                #                 if test_acc_1 > max_acc_test:
                #                     max_acc_test=test_acc_1
                #                 if test_acc_2 > max_acc_test:
                #                     max_acc_test=test_acc_2
                if test_acc_comb > max_acc_test:
                    max_acc_test = test_acc_comb
                    #                     store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_single_model_'+str(max_acc_test), params)

                    if len(pred_ys) != len(gold_test_rows):
                        print 'len(pred_ys)!=len(gold_test_rows):', len(
                            pred_ys), len(gold_test_rows)
                    else:
                        test_write = open(
                            '/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/error_analysis_'
                            + str(max_acc_test) + '.txt', 'w')
                        for i in range(len(pred_ys)):
                            test_write.write(
                                str(pred_ys[i]) + '\t' + gold_test_rows[i] +
                                '\n')
                        print 'error analysis file written over.'
                        test_write.close()

                print '\t\tcurrent acc:', test_acc_comb, '\t\t\t\t\tmax_acc:', max_acc_test


#                 else:
#                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
def evaluate_lenet5(learning_rate=0.0001,
                    n_epochs=2000,
                    nkerns=[256, 256],
                    batch_size=1,
                    window_width=[4, 4],
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    Div_reg=0.06,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40):
    maxSentLength = max_truncate + 2 * (window_width[0] - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_wikiQA_corpus(
        rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt',
        rootPath + 'test_filtered.txt', max_truncate,
        maxSentLength)  #vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test = load_mts_wikiQA(
        mtPath + 'result_train/concate_2mt_train.txt',
        mtPath + 'result_test/concate_2mt_test.txt')
    wm_train, wm_test = load_wmf_wikiQA(
        rootPath + 'train_word_matching_scores.txt',
        rootPath + 'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
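    # Theano idiom: GPU-resident shared variables had to be floatX, so the
    # integer index matrices are stored as floatX and cast back to int64
    # symbolically below; the cast lives inside the graph and the data stays
    # on the device.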
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # x is a matrix of word indices, so it must be integer-typed
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    wmf = T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # each sentence is an emb_size x maxSentLength feature map
    filter_size = (emb_size, window_width[0])
    filter_size_2 = (nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the embedding of every word index and reshape each sentence
    # into an (emb_size, maxSentLength) matrix (one word per column)
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()

    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor')

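    # Two cheap pair features from summed (bag-of-words) embeddings are built
    # below: cosine similarity, and a Euclidean distance squashed into (0, 1]
    # via 1/(1 + d), so identical vectors score 1 and the score decays with d.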
    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))  #25.2%

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
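    # The same GRU parameters (U, W, b) encode both sentences, i.e. a siamese
    # setup: left and right share weights so their representations live in a
    # common space.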

    cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))  #25.2%

    #ibm attentive pooling at extended sentence level
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim,
        layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) / 2)
    #     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
    #     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
    #     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
    #     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()
    #     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
    #     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%

    #ibm attentive pooling at original sentence level
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))  #25.2%
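    # A rough numpy sketch of the attentive pooling above, assuming the
    # similarity behaves like a dot product between hidden states (the exact
    # metric lives in compute_simi_feature_matrix_with_matrix):
    #   S = H_l.T.dot(H_r)              # (len_l, len_r) similarity matrix
    #   a_l = softmax(S.max(axis=1))    # one weight per left position
    #   ibm_l = H_l.dot(a_l)            # attention-weighted left representation
    # and symmetrically for the right side with S.max(axis=0).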

    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]  #keep only the 3 positions with the highest attention
    ll = T.sort(kNeighborsArg).flatten()  # keep the selected column indices in ascending order

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[
        -3:]  #keep only the 3 positions with the highest attention
    rr = T.sort(kNeighborsArg_r).flatten()  # keep the selected column indices in ascending order

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')
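    # In numpy terms, the top-3 selection above is roughly:
    #   order = np.argsort(row_max)     # ascending by attention score
    #   top3 = np.sort(order[-3:])      # 3 best positions, in ascending order
    #   H_sub = H[:, top3]              # keep only those hidden-state columns
    # The reduced matrices then feed the second GRU layer below.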

    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]

    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])),
                        'vec_l')
    vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])),
                        'vec_r')

    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #
    uni_cosine = cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))  #25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))
    #     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [
            vec_l,
            vec_r,
            uni_cosine,
            eucli_1,
            cosine_addition,
            eucli_addition,
            #                                 cosine_sent, eucli_sent,
            ibm_l.reshape((1, nkerns[0])),
            ibm_r.reshape((1, nkerns[0])),  #2*nkerns[0]+
            cosine_ibm,
            eucli_ibm,
            len_l,
            len_r,
            wmf
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(2 * nkerns[1] + 2) + 2 +
                                (2 * nkerns[0] + 2) + 2 + 2,
                                n_out=2)
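    # Feature-size accounting for n_in above:
    #   vec_l, vec_r, uni_cosine, eucli_1        -> 2*nkerns[1] + 2
    #   cosine_addition, eucli_addition          -> 2
    #   ibm_l, ibm_r, cosine_ibm, eucli_ibm      -> 2*nkerns[0] + 2
    #   len_l, len_r                             -> 2
    #   wmf                                      -> 2 (assuming two word-matching
    #                                              scores per pair, matching the final +2)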

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() +
        (W1**2).sum(), 'L2_reg'
    )  #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg = Diversify_Reg(layer3.W.T) + Diversify_Reg(
        U[0]) + Diversify_Reg(W[0]) + Diversify_Reg(U1[0]) + Diversify_Reg(
            W1[0]) + Diversify_Reg(U[1]) + Diversify_Reg(W[1]) + Diversify_Reg(
                U1[1]) + Diversify_Reg(W1[1]) + Diversify_Reg(
                    U[2]) + Diversify_Reg(W[2]) + Diversify_Reg(
                        U1[2]) + Diversify_Reg(W1[2])
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print((cost_this + cost_tmp) / update_freq +
                       L2_weight * L2_reg + Div_reg * diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.prop_for_posi, layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            wmf: wm_test[index:index + batch_size]
        },
        on_unused_input='ignore')
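    # The givens= mapping slices every shared dataset by the scalar `index`
    # inside the compiled graph, so only the integer index crosses the
    # Python/Theano boundary per call; no minibatch data is copied in.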

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer1_para + layer0_para  #+[embeddings]# + layer1.params

    #     params_conv = [conv_W, conv_b]

    #     accumulator=[]
    #     for para_i in params:
    #         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
    #         accumulator.append(theano.shared(eps_p, borrow=True))
    #
    #     # create a list of gradients for all model parameters
    #     grads = T.grad(cost, params)
    #
    #     updates = []
    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         grad_i=debug_print(grad_i,'grad_i')
    #         acc = acc_i + T.sqr(grad_i)
    #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
    #         updates.append((acc_i, acc))

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates

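    # Note on the Adam variant above: b1 and b2 play the role of (1 - beta1)
    # and (1 - beta2) in the usual Adam notation, m and v are the biased
    # first/second moment estimates, and fix1/fix2 fold the bias correction
    # into the step size: lr_t = lr * sqrt(1 - (1-b2)^t) / (1 - (1-b1)^t).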
    updates = Adam(cost=cost, params=params, lr=learning_rate)

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    # (set absurdly high, which effectively disables early stopping)
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False

    svm_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #             print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
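            # update_freq scheme: between updates, batches only accumulate
            # their cost into cost_tmp; on the update step, cost_tmp enters
            # the graph as a constant input, so (cost_this + cost_tmp) /
            # update_freq reports an averaged cost while gradients still flow
            # only through the current batch's cost_this.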
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    prob_i, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt',
                                           test_probs)
                #now, check MAP and MRR
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP %f, '
                     'MRR %f') %
                    (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

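                # Re-ranking check: the layer3_input feature vectors collected
                # on train and test are handed to an external linear SVM and a
                # linear regression, and their scores are evaluated with
                # MAP/MRR to see how much signal the features carry on their
                # own, independently of the softmax layer.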
                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm = clf.decision_function(test_features)
                MAP_svm, MRR_svm = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_svm)

                lr = LinearRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                MAP_lr, MRR_lr = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
    end_time = time.time()
    print('Optimization complete.')
    # note: best_validation_loss, best_iter and test_score are never updated
    # in this function, so the line below reports their initial placeholder values
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    L2_weight=0.000001,
                    drop_p=0.05,
                    emb_size=300,
                    hidden_size=500,
                    HL_hidden_size=500,
                    batch_size=5,
                    filter_size=[3, 5, 7],
                    maxSentLen=180,
                    comment=''):

    model_options = locals().copy()
    print "model options", model_options

    rng = np.random.RandomState(
        1234)  #fixed random seed so repeated runs produce the same results
    srng = RandomStreams(rng.randint(999999))
    all_sentences, all_masks, all_labels, word2id = load_yelp_dataset(
        maxlen=maxSentLen, minlen=2
    )  #minlen=2: keep only lines that contain at least a label and one word
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian
    #then overwrite rows with pretrained vectors wherever a word is covered
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec_file('glove.840B.300d.txt')  #despite the name, this loads GloVe vectors
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap the numpy matrix "rand_values" in a theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.ivector('labels')
    train_flag = T.iscalar()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #(batch, emb_size, len): a layout that CNN, GRU or LSTM layers can consume

    #     drop_common_input = dropout_layer(srng, common_input, drop_p, train_flag)

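    # Masked bag-of-words below: padding positions carry mask 0, so summing
    # over the length axis only accumulates real tokens. Per example b, this
    # computes roughly bow[b] = sum_t common_input[b, :, t] * mask[b, t].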
    bow = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1),
                axis=2)  #(batch, emb_size)

    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_2_pre, conv_b_2_pre = create_conv_para(
        rng, filter_shape=gate_filter_shape)
    conv_W_2_gate, conv_b_2_gate = create_conv_para(
        rng, filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size, 1, emb_size,
                                                    filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size, 1, emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    #     conv_W3, conv_b3=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size[2]))
    #     conv_W3_context, conv_b3_context=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 1))
    # conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    soft_att_W_big, soft_att_b_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W_small, _ = create_HiddenLayer_para(rng, emb_size, 1)
    soft_att_W2_big, soft_att_b2_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W2_small, _ = create_HiddenLayer_para(rng, emb_size, 1)

    #     soft_att_W3_big, soft_att_b3_big = create_HiddenLayer_para(rng, emb_size*2, emb_size)
    #     soft_att_W3_small, _ = create_HiddenLayer_para(rng, emb_size, 1)

    NN_para = [
        conv_W_2_pre,
        conv_b_2_pre,
        conv_W_2_gate,
        conv_b_2_gate,
        conv_W,
        conv_b,
        conv_W_context,
        conv_W2,
        conv_b2,
        conv_W2_context,
        #              conv_W3, conv_b3,conv_W3_context,
        soft_att_W_big,
        soft_att_b_big,
        soft_att_W_small,
        soft_att_W2_big,
        soft_att_b2_big,
        soft_att_W2_small
        #              soft_att_W3_big, soft_att_b3_big,soft_att_W3_small
    ]  #,conv_W3, conv_b3,conv_W3_context]

    conv_layer_1_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_2_pre,
        b=conv_b_2_pre,
        W_gate=conv_W_2_gate,
        b_gate=conv_b_2_gate)

    advanced_sent_tensor3 = conv_layer_1_gate_l.output_tensor3
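    # Gated convolution over the sentence: with the 1-wide gate_filter_shape
    # this plausibly computes, per position t, something like
    #   h_t = f(W * x_t + b) * sigmoid(W_gate * x_t + b_gate)
    # (assuming Conv_with_Mask_with_Gate follows the usual conv-times-gate
    # pattern), letting the gate decide how much of each word to pass on.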

    #     conv_layer_pair = Conv_for_Pair(rng,
    #             origin_input_tensor3=advanced_sent_tensor3,
    #             origin_input_tensor3_r = advanced_sent_tensor3,
    #             input_tensor3=advanced_sent_tensor3,
    #             input_tensor3_r = advanced_sent_tensor3,
    #              mask_matrix = sents_mask,
    #              mask_matrix_r = sents_mask,
    #              image_shape=(batch_size, 1, emb_size, maxSentLen),
    #              image_shape_r = (batch_size, 1, emb_size, maxSentLen),
    #              filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
    #              filter_shape_context=(hidden_size, 1, emb_size, 1),
    #              W=conv_W, b=conv_b,
    #              W_context=conv_W_context, b_context=conv_b_context)

    conv_layer_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context,
        soft_att_W_big=soft_att_W_big,
        soft_att_b_big=soft_att_b_big,
        soft_att_W_small=soft_att_W_small)

    #     conv_layer_2_pair = Conv_for_Pair(rng,
    #             origin_input_tensor3=advanced_sent_tensor3,
    #             origin_input_tensor3_r = advanced_sent_tensor3,
    #             input_tensor3=advanced_sent_tensor3,
    #             input_tensor3_r = advanced_sent_tensor3,
    #              mask_matrix = sents_mask,
    #              mask_matrix_r = sents_mask,
    #              image_shape=(batch_size, 1, emb_size, maxSentLen),
    #              image_shape_r = (batch_size, 1, emb_size, maxSentLen),
    #              filter_shape=(hidden_size, 1, emb_size, filter_size[1]),
    #              filter_shape_context=(hidden_size, 1, emb_size, 1),
    #              W=conv_W2, b=conv_b2,
    #              W_context=conv_W2_context, b_context=conv_b2_context)
    conv_layer_2_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context,
        soft_att_W_big=soft_att_W2_big,
        soft_att_b_big=soft_att_b2_big,
        soft_att_W_small=soft_att_W2_small)

    #     conv_layer_3_pair = Conv_for_Pair_SoftAttend(rng,
    #                                                origin_input_tensor3=advanced_sent_tensor3,
    #                                                origin_input_tensor3_r=advanced_sent_tensor3,
    #                                                input_tensor3=advanced_sent_tensor3,
    #                                                input_tensor3_r=advanced_sent_tensor3,
    #                                                mask_matrix=sents_mask,
    #                                                mask_matrix_r=sents_mask,
    #                                                filter_shape=(hidden_size, 1, emb_size, filter_size[2]),
    #                                                filter_shape_context=(hidden_size, 1, emb_size, 1),
    #                                                image_shape=(batch_size, 1, emb_size, maxSentLen),
    #                                                image_shape_r= (batch_size, 1, emb_size, maxSentLen),
    #                                                W=conv_W3, b=conv_b3,
    #                                                W_context=conv_W3_context, b_context=conv_b3_context,
    #                                                soft_att_W_big=soft_att_W3_big, soft_att_b_big=soft_att_b3_big,
    #                                                soft_att_W_small=soft_att_W3_small)

    # biased_sent_embeddings = conv_layer_pair.biased_attentive_maxpool_vec_l
    sent_embeddings = conv_layer_pair.maxpool_vec_l
    att_sent_embeddings = conv_layer_pair.attentive_maxpool_vec_l

    sent_embeddings_2 = conv_layer_2_pair.maxpool_vec_l
    att_sent_embeddings_2 = conv_layer_2_pair.attentive_maxpool_vec_l

    #classification head: concatenate the sentence features, feed them through two hidden layers, then map to the 5 target classes
    HL_input = T.concatenate(
        [
            bow, sent_embeddings, att_sent_embeddings, sent_embeddings_2,
            att_sent_embeddings_2
            #                               sent_embeddings_3,att_sent_embeddings_3,
        ],
        axis=1)
    HL_input_size = hidden_size * 4 + emb_size
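    # Dimension check: bow contributes emb_size; each of the two conv layers
    # contributes a plain maxpool vector plus an attentive maxpool vector of
    # hidden_size each, hence hidden_size * 4 + emb_size.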

    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_input_size, HL_hidden_size)
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_input,
                             n_in=HL_input_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.nnet.relu)
    #     HL_layer_1_output = dropout_layer(srng, HL_layer_1.output, drop_p, train_flag)

    HL_layer_2_W, HL_layer_2_b = create_HiddenLayer_para(
        rng, HL_hidden_size, HL_hidden_size)
    HL_layer_2_params = [HL_layer_2_W, HL_layer_2_b]
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=HL_hidden_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_2_W,
                             b=HL_layer_2_b,
                             activation=T.nnet.relu)
    #     HL_layer_2_output = dropout_layer(srng, HL_layer_2.output, drop_p, train_flag)

    LR_input = T.concatenate([HL_input, HL_layer_1.output, HL_layer_2.output],
                             axis=1)
    #     drop_LR_input = dropout_layer(srng, LR_input, drop_p, train_flag)
    LR_input_size = HL_input_size + 2 * HL_hidden_size

    U_a = create_ensemble_para(
        rng, 5, LR_input_size)  # the LR weight matrix, mapping LR_input_size features to 5 classes
    #     norm_W_a = normalize_matrix(U_a)
    LR_b = theano.shared(value=np.zeros((5, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=5, W=U_a, b=LR_b
    )  #essentially a linear map from the feature vector to 5 class scores, followed by softmax
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification we use the negative log-likelihood loss; the lower the better

    params = [
        embeddings
    ] + NN_para + HL_layer_1_params + HL_layer_2_params + LR_para  # put all model parameters together
    L2_reg = L2norm_paraList([
        embeddings, conv_W_2_pre, conv_W_2_gate, conv_W, conv_W_context,
        conv_W2, conv_W2_context, soft_att_W_big, soft_att_W_small,
        soft_att_W2_big, soft_att_W2_small, HL_layer_1_W, HL_layer_2_W, U_a
    ])
    #     diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix)

    cost = loss  #+L2_weight*L2_reg

    grads = T.grad(
        cost, params)  # create a list of gradients for all model parameters

    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True),
                              dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i, param_i - learning_rate * grad_i /
             (T.sqrt(acc) + 1e-8)))  #1e-8 is added to avoid division by zero
        updates.append((acc_i, acc))
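    # This update rule is AdaGrad: each parameter accumulates its squared
    # gradients, and the effective step learning_rate / sqrt(sum g^2) shrinks
    # for parameters that have already received large updates.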

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        [layer_LR.errors(labels), layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
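    # Integer division drops the ragged tail of each split, so an extra start
    # index (size - batch_size) is appended: the final batch overlaps the
    # previous one but keeps every example covered at the fixed batch_size
    # that the compiled graph expects.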

    max_acc_dev = 0.0
    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1
        # combined = zip(train_sents, train_masks, train_labels)
        random.Random(200).shuffle(
            train_indices
        )  #reshuffle the training order each epoch; this usually helps, though it is not guaranteed
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], 1)

            #after every 2000 batches, we test the performance of the model on all test data
            if iter % 2000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                # writefile=open('log.'+nn+'.senti.preditions.txt', 'w')
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i, pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size],
                        0)
                    # pred_labels=list(pred_labels)
                    # if test_batch_id !=test_batch_start[-1]:
                    #     writefile.write('\n'.join(map(str,pred_labels))+'\n')
                    # else:
                    #     writefile.write('\n'.join(map(str,pred_labels[-test_size%batch_size:])))

                    error_sum += error_i
                # writefile.close()
                test_accuracy = 1.0 - error_sum / (len(test_batch_start))
                if test_accuracy > max_acc_test:
                    max_acc_test = test_accuracy
                print '\t\tcurrent test acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test