Example #1
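# [Editor's note] The evaluate_lenet5 variants below assume the usual
# module-level imports for this Theano code base. A minimal sketch of the
# standard-library/third-party part is given here; the project-specific
# helpers (load_MCTest_corpus_DPN, create_conv_para, Conv_with_input_para,
# Average_Pooling_Scan, debug_print, cosine, etc.) live in the author's own
# modules, whose names are not shown in this listing.
import os
import sys
import time
from random import shuffle

import numpy
import theano
import theano.tensor as T
from sklearn import svm, linear_model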
def evaluate_lenet5(file_name,
                    vocab_file,
                    train_file,
                    dev_file,
                    word2vec_file,
                    learning_rate=0.001,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=128,
                    max_d_length=128,
                    margin=0.3):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    f = open(file_name, 'w')
    f.write("model options " + str(model_options) + '\n')
    rng = numpy.random.RandomState(23455)
    train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN(
        vocab_file, train_file, dev_file, max_s_length, maxSentLength,
        maxDocLength)  # vocab_size covers train, dev and test
    f.write('train_size : ' + str(train_size) + '\n')

    [
        train_data_D, train_data_A1, train_Label, train_Length_D,
        train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_A1, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_A1
    ] = train_data
    [
        test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s,
        test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_A1
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, word2vec_file)
    embeddings = theano.shared(value=rand_values, borrow=True)
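    # [Editor's note] load_word2vec_to_init is one of the author's helpers and
    # is not shown in this listing. A rough sketch of what such a loader
    # typically does, assuming a plain-text "word v1 ... v_d" embedding file
    # and a word -> row-index vocabulary (both assumptions, not the project's
    # actual code):
    def _load_pretrained_embeddings_sketch(init_matrix, emb_path, word2id):
        for line in open(emb_path):
            parts = line.split()
            word, vec = parts[0], parts[1:]
            if word in word2id and len(vec) == init_matrix.shape[1]:
                init_matrix[word2id[word]] = numpy.asarray(
                    vec, dtype=theano.config.floatX)
        return init_matrix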

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # index_D is the document word-index matrix; must be integer
    index_A1 = T.lvector()
    y = T.lscalar()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_A1 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_A1 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_A1 = T.lscalar()

    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    f.write('... building the model\n')

    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # the highway layer requires nkerns[0] and nkerns[1] to have the same dimensionality
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')

    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')

    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way

    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')

    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
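    # [Editor's note] The two blends above are the highway-network pattern: a
    # sigmoid transform gate decides how much of the doc-level representation
    # is carried through versus the sentence-level one. A minimal NumPy sketch
    # of the same rule (illustration only; the symbolic expressions above are
    # what the model actually uses):
    def _highway_blend_sketch(h_sent, h_doc, W, b):
        gate = 1.0 / (1.0 + numpy.exp(-(numpy.dot(W, h_sent) + b)))
        return (1.0 - gate) * h_sent + gate * h_doc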

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0
    logistic_w, logistic_b = create_logistic_para(rng, 1, 2)
    logistic_para = [logistic_w, logistic_b]
    params += logistic_para
    simi_1 = T.dot(logistic_w, simi_1) + logistic_b.dimshuffle(0, 'x')
    simi_1 = simi_1.dimshuffle(1, 0)

    simi_1 = T.nnet.softmax(simi_1)
    predict = T.argmax(simi_1, axis=1)
    tmp = T.log(simi_1)
    cost = T.maximum(0.0, margin + tmp[0][1 - y] - tmp[0][y])
    L2_reg = (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum() + (
        logistic_w**2).sum()
    cost = cost + L2_weight * L2_reg
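    # [Editor's note] The loss above is a margin ranking loss on log-softmax
    # scores: the three cosine similarities are averaged, mapped to two class
    # scores by the logistic layer, and the hinge is zero once
    # log p(correct label y) beats log p(the other label) by `margin`; an L2
    # penalty on the weights is then added. A NumPy sketch of the hinge part
    # (illustration only):
    def _margin_log_softmax_loss_sketch(class_scores, y, margin=0.3):
        shifted = class_scores - class_scores.max()
        log_p = shifted - numpy.log(numpy.exp(shifted).sum())
        return max(0.0, margin + log_p[1 - y] - log_p[y])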

    test_model = theano.function(
        [index],
        [cost, simi_1, predict],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_A1: test_data_A1[index],
            y: test_Label[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_A1: test_Length_A1[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_A1: test_leftPad_A1[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_A1: test_rightPad_A1[index],
        },
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))
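    # [Editor's note] The loop above is plain AdaGrad: each parameter keeps a
    # running sum of squared gradients and its update is scaled by the inverse
    # square root of that sum. A NumPy sketch of one step; the small `eps` is
    # our addition (not in the original code) to avoid a 0/0 when a gradient
    # is exactly zero on the first step:
    def _adagrad_step_sketch(param, grad, acc, lr=0.001, eps=1e-8):
        acc = acc + grad ** 2
        return param - lr * grad / (numpy.sqrt(acc) + eps), acc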

    train_model = theano.function(
        [index], [cost, simi_1, predict],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_A1: train_data_A1[index],
            y: train_Label[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_A1: train_Length_A1[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_A1: train_leftPad_A1[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_A1: train_rightPad_A1[index],
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    f.write('... training\n')
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        simi_train = []
        predict_train = []
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            cost_average, simi, predict = train_model(batch_start)
            simi_train.append(simi)
            predict_train.append(predict)
            if iter % 1000 == 0:
                f.write('@iter :' + str(iter) + '\n')
            if iter % n_train_batches == 0:
                corr_train = compute_corr_train(predict_train, _train_Label)
                res = 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' corr rate: ' + str(
                            corr_train * 100.0 / train_size) + '\n'
                f.write(res)

            if iter % validation_frequency == 0 or iter % 20000 == 0:
                posi_test_sent = []
                nega_test_sent = []
                posi_test_doc = []
                nega_test_doc = []
                posi_test_overall = []
                nega_test_overall = []

                simi_test = []
                predict_test = []
                for i in test_batch_start:
                    cost, simi, predict = test_model(i)
                    #print simi
                    #f.write('test_predict : ' + str(predict) + ' test_simi : ' + str(simi) + '\n' )
                    simi_test.append(simi)
                    predict_test.append(predict)
                corr_test = compute_corr(simi_test, predict_test, f)
                test_acc = corr_test * 1.0 / (test_size / 4.0)  # test_size counts 4 candidate answers per question
                res = '\t\t\tepoch ' + str(epoch) + ', minibatch ' + str(
                    minibatch_index) + ' / ' + str(
                        n_train_batches) + ' test acc of best model ' + str(
                            test_acc * 100.0) + '\n'
                f.write(res)

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                res = '\t\t\tmax: ' + str(max_acc) + ' (at ' + str(
                    best_epoch) + ')\n'
                f.write(res)
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[1,1], batch_size=1, window_width=3,
                    maxSentLength=60, emb_size=300, L2_weight=0.0005, update_freq=1, unifiedWidth_conv0=8, k_dy=3, ktop=3):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    #mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
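    # [Editor's note] Storing the integer index matrices as floatX shared
    # variables and casting them back to int64 is the old Theano-tutorial idiom
    # for keeping a dataset in a shared variable (e.g. on the GPU) while still
    # being able to use it for integer indexing. The same idiom in one helper
    # (a sketch, not part of the original code):
    def _shared_int_matrix_sketch(data):
        shared = theano.shared(numpy.asarray(data, dtype=theano.config.floatX),
                               borrow=True)
        return T.cast(shared, 'int64')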
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # word-index matrix for the left sentence; must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    #mts=T.dmatrix()
    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence 'image' shape: (embedding dim, max sentence length)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv0=ishape[1]+filter_size[1]-1
    poolsize1=(1, length_after_wideConv0)
    length_after_wideConv1=unifiedWidth_conv0+filter_size[1]-1
    poolsize2=(1, length_after_wideConv1)
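    # [Editor's note] "Wide" (full) convolution produces an output of length
    # input_length + filter_width - 1, which is exactly what
    # length_after_wideConv0 and length_after_wideConv1 compute above. A quick
    # self-contained check of that length rule (illustration only):
    def _wide_conv_length_check_sketch(n=60, k=3):
        out = numpy.convolve(numpy.ones(n), numpy.ones(k), mode='full')
        assert len(out) == n + k - 1
        return len(out)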
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices into 4D tensors of shape
    # (batch_size, 1, emb_size, maxSentLength), compatible with the conv-fold-pool layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_l, right=right_l, 
                        W=conv_W, b=conv_b,
                        firstLayer=True)
    layer0_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_r, right=right_r, 
                        W=conv_W, b=conv_b,
                        firstLayer=True)
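    # [Editor's note] Conv_Fold_DynamicK_PoolLayer_NAACL (k=k_dy here, k=ktop in
    # the second layer) applies k-max pooling in the dynamic-CNN style: keep
    # the k largest activations of each row while preserving their original
    # left-to-right order. A minimal NumPy sketch of that pooling step
    # (illustration only; folding and padding are handled inside the layer):
    def _k_max_pooling_sketch(row, k):
        top_idx = numpy.argsort(row)[-k:]
        return row[numpy.sort(top_idx)]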

    layer0_l_output=debug_print(layer0_ll.fold_output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_rr.fold_output, 'layer0_r.output')
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=ishape[0]/2,
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)

    conv_W2, conv_b2=create_conv_para(rng, filter_shape=(1, 1, filter_size[0]/2, filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer1_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_ll.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0),
            filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_ll.leftPad, right=layer0_ll.rightPad, 
                        W=conv_W2, b=conv_b2,
                        firstLayer=False)
    layer1_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_rr.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0),
            filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_rr.leftPad, right=layer0_rr.rightPad, 
                        W=conv_W2, b=conv_b2,
                        firstLayer=False)

    layer1_l_output=debug_print(layer1_ll.fold_output, 'layer1_l.output')
    layer1_r_output=debug_print(layer1_rr.fold_output, 'layer1_r.output')
    
    layer2=Average_Pooling_for_Top(rng, input_l=layer1_l_output, input_r=layer1_r_output, kern=ishape[0]/4,
                                       left_l=layer0_ll.leftPad, right_l=layer0_ll.rightPad, left_r=layer0_rr.leftPad, right_r=layer0_rr.rightPad, 
                                       length_l=k_dy+filter_size[1]-1, length_r=k_dy+filter_size[1]-1,
                                       dim=unifiedWidth_conv0+filter_size[1]-1)    

    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
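    # [Editor's note] The two unigram-level features above are a cosine
    # similarity between the summed word vectors and a Euclidean distance
    # squashed into (0, 1] via 1/(1+d). A NumPy sketch of both (`cosine` and
    # `EUCLID` are the author's helpers, so this is only an illustration):
    def _unigram_similarity_sketch(u, v):
        cos = numpy.dot(u, v) / (numpy.linalg.norm(u) * numpy.linalg.norm(v))
        eucli_simi = 1.0 / (1.0 + numpy.linalg.norm(u - v))
        return cos, eucli_simi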
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([#mts, 
                                eucli_1, uni_cosine,
                                #norm_uni_l, norm_uni_r,#uni_cosine,#norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                
                                layer1.output_eucli_to_simi,layer1.output_cosine,
                                layer1.output_attentions, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                #layer1.output_vector_l,layer1.output_vector_r,
                                
                                layer2.output_eucli_to_simi,layer2.output_cosine,
                                layer2.output_attentions,
                                #layer2.output_vector_l,layer2.output_vector_r,
                                
                                len_l, len_r
                                #layer1.output_attentions,
                                #wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(2)+(2+4*4)+(2+4*4)+2, n_out=2)  # 2 unigram features + (2 simi + 4*4 attention) from layer1 and from layer2 + 2 length features = 40
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum()+(conv_W2**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y), layer3.y_pred, layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index]
            #mts: mt_test[index: index + batch_size],
            #wmf: wm_test[index: index + batch_size]
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W]+[conv_W2]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #grad_i=debug_print(grad_i,'grad_i')
        #norm=T.sqrt((grad_i**2).sum())
        #if T.lt(norm_threshold, norm):
        #    print 'big norm'
        #    grad_i=grad_i*(norm_threshold/norm)
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
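    # [Editor's note] The commented-out lines in the loop above hint at
    # gradient-norm clipping with `norm_threshold`. A Python `if` on a Theano
    # expression does not do what one would hope; done symbolically it would
    # look roughly like this (a sketch, not active in the original code):
    def _clip_grad_sketch(grad_i, norm_threshold=5.0):
        norm = T.sqrt((grad_i ** 2).sum())
        return T.switch(T.lt(norm_threshold, norm),
                        grad_i * (norm_threshold / norm), grad_i)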
  
    train_model = theano.function([index,cost_tmp], [cost,layer3.errors(y), layer3_input], updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index]
            #mts: mt_train[index: index + batch_size],
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index]
            #mts: mt_train[index: index + batch_size],
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'cost_ij: ', cost_ij
                cost_tmp+=cost_ij
                error_sum+=error_ij
            else:
                cost_average, error_ij, layer3_input= train_model(batch_start,cost_tmp)
                #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq)
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print layer3_input
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                
                test_score = numpy.mean(test_losses)
                test_acc=1-test_score
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                #now, see the results of svm
                #write_feature=open('feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                #write_feature.close()

                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                        corr_lr+=1
                acc=corr_count*1.0/test_size
                acc_lr=corr_lr*1.0/test_size
                if acc > max_acc:
                    max_acc=acc
                    best_epoch=epoch
                if acc_lr> max_acc:
                    max_acc=acc_lr
                    best_epoch=epoch
                if test_acc> max_acc:
                    max_acc=test_acc
                    best_epoch=epoch
                print '\t\t\t\t\t\t\t\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at epoch: ', best_epoch     
                #exit(0)
            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=0.2):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DSSSS(
        rootPath + 'vocab_DSSSS.txt', rootPath +
        'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt',
        max_s_length, maxSentLength,
        maxDocLength)  # vocab_size covers train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
    #          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
    #         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
    #         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
    # return results, line_control
    [
        train_data_D, train_data_A1, train_data_A2, train_data_A3,
        train_data_A4, train_Label, train_Length_D, train_Length_D_s,
        train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
        train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2,
        train_leftPad_A3, train_leftPad_A4, train_rightPad_D,
        train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3, train_rightPad_A4
    ] = train_data
    [
        test_data_D, test_data_A1, test_data_A2, test_data_A3, test_data_A4,
        test_Label, test_Length_D, test_Length_D_s, test_Length_A1,
        test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D,
        test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
        test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1,
        test_rightPad_A2, test_rightPad_A3, test_rightPad_A4
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # index_D is the document word-index matrix; must be integer
    #     index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    #     y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    #     len_Q=T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    #     left_Q=T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    #     right_Q=T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices into 4D tensors of shape
    # (n_sentences or batch_size, 1, emb_size, maxSentLength), compatible with the conv layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A2 = Conv_with_input_para(
        rng,
        input=layer0_A2_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A3 = Conv_with_input_para(
        rng,
        input=layer0_A3_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A4 = Conv_with_input_para(
        rng,
        input=layer0_A4_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output, 'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output, 'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output, 'layer0_A4.output')

    #     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA2 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A2_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A2,
                                      right_r=right_A2,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A2 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA3 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A3_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A3,
                                      right_r=right_A3,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A3 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA4 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A4_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A4,
                                      right_r=right_A4,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A4 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    #load_model_for_conv2([conv2_W, conv2_b])  # cannot be used here because nkerns[0] != filter_size[0]
    #conv from sentence to doc
    #     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA2 = Conv_with_input_para(
        rng,
        input=layer1_DA2.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA3 = Conv_with_input_para(
        rng,
        input=layer1_DA3.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA4 = Conv_with_input_para(
        rng,
        input=layer1_DA4.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    #     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA2.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA3.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A4 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA4.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel = debug_print(
        layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel = debug_print(
        layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
    layer2_A4_output_sent_rep_Dlevel = debug_print(
        layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')

    #     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA2 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA2.output,
        input_r=layer2_A2_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA3 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA3.output,
        input_r=layer2_A3_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA4 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA4.output,
        input_r=layer2_A4_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way

    #     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_DA2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b),
        'transform_gate_DA2')
    transform_gate_DA3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b),
        'transform_gate_DA3')
    transform_gate_DA4 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b),
        'transform_gate_DA4')
    #     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')
    transform_gate_A2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b),
        'transform_gate_A2')
    transform_gate_A3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b),
        'transform_gate_A3')
    transform_gate_A4 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b),
        'transform_gate_A4')

    #     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_D_A2 = (
        1.0 - transform_gate_DA2
    ) * layer1_DA2.output_D_sent_level_rep + transform_gate_DA2 * layer3_DA2.output_D_doc_level_rep
    overall_D_A3 = (
        1.0 - transform_gate_DA3
    ) * layer1_DA3.output_D_sent_level_rep + transform_gate_DA3 * layer3_DA3.output_D_doc_level_rep
    overall_D_A4 = (
        1.0 - transform_gate_DA4
    ) * layer1_DA4.output_D_sent_level_rep + transform_gate_DA4 * layer3_DA4.output_D_doc_level_rep

    #     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
    overall_A2 = (
        1.0 - transform_gate_A2
    ) * layer1_DA2.output_QA_sent_level_rep + transform_gate_A2 * layer2_A2.output_sent_rep_Dlevel
    overall_A3 = (
        1.0 - transform_gate_A3
    ) * layer1_DA3.output_QA_sent_level_rep + transform_gate_A3 * layer2_A3.output_sent_rep_Dlevel
    overall_A4 = (
        1.0 - transform_gate_A4
    ) * layer1_DA4.output_QA_sent_level_rep + transform_gate_A4 * layer2_A4.output_sent_rep_Dlevel

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2 = debug_print(
        cosine(layer1_DA2.output_D_sent_level_rep,
               layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
    simi_sent_level3 = debug_print(
        cosine(layer1_DA3.output_D_sent_level_rep,
               layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
    simi_sent_level4 = debug_print(
        cosine(layer1_DA4.output_D_sent_level_rep,
               layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2 = debug_print(
        cosine(layer3_DA2.output_D_doc_level_rep,
               layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
    simi_doc_level3 = debug_print(
        cosine(layer3_DA3.output_D_doc_level_rep,
               layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
    simi_doc_level4 = debug_print(
        cosine(layer3_DA4.output_D_doc_level_rep,
               layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')
    simi_overall_level2 = debug_print(cosine(overall_D_A2, overall_A2),
                                      'simi_overall_level2')
    simi_overall_level3 = debug_print(cosine(overall_D_A3, overall_A3),
                                      'simi_overall_level3')
    simi_overall_level4 = debug_print(cosine(overall_D_A4, overall_A4),
                                      'simi_overall_level4')

    simi_1 = simi_overall_level1  #+simi_sent_level1+simi_doc_level1
    simi_2 = simi_overall_level2  #+simi_sent_level2+simi_doc_level2
    simi_3 = simi_overall_level3  #+simi_sent_level3+simi_doc_level3
    simi_4 = simi_overall_level4  #+simi_sent_level4+simi_doc_level4
    #     simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0
    #     simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
    #     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
    #     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #     #only use overall_simi
    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    #     posi_simi=simi_overall_level1
    #     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
    #     cost=T.maximum(0.0, margin+simi_2-simi_1)+T.maximum(0.0, margin+simi_3-simi_1)+T.maximum(0.0, margin+simi_4-simi_1)
    cost12 = T.maximum(
        0.0, margin + simi_sent_level2 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level2 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level2 - simi_overall_level1)
    cost13 = T.maximum(
        0.0, margin + simi_sent_level3 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level3 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level3 - simi_overall_level1)
    cost14 = T.maximum(
        0.0, margin + simi_sent_level4 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level4 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level4 - simi_overall_level1)
    cost = cost12 + cost13 + cost14
    posi_simi = T.max([simi_sent_level1, simi_doc_level1, simi_overall_level1])
    nega_simi = T.max([
        simi_sent_level2, simi_doc_level2, simi_overall_level2,
        simi_sent_level3, simi_doc_level3, simi_overall_level3,
        simi_sent_level4, simi_doc_level4, simi_overall_level4
    ])
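    # Training objective: for each distractor A2..A4, a margin ranking (hinge) loss at
    # three granularities -- sentence level, document level and the highway-combined
    # overall level -- pushing the correct answer A1 at least `margin` higher.
    # posi_simi / nega_simi are not part of the loss; they are only used to check
    # whether A1 currently beats the best distractor (train / test accuracy).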

    L2_reg = debug_print(
        (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg'
    )  #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  #a matrix
            #             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            #             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            #             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            #             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
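    # AdaGrad below: accumulate the squared gradient per parameter and divide the
    # learning rate by the square root of that sum.  Note there is no epsilon inside
    # the sqrt, so the very first updates rely on the gradients being non-zero.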

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))


    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         acc = acc_i + T.sqr(grad_i)
    #         if param_i == embeddings:
    #             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
    #         else:
    #             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
    #         updates.append((acc_i, acc))

    train_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        corr_train = 0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1

            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' corr rate: ' + str(
                            corr_train * 100.0 / train_size)

            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = corr_test * 1.0 / test_size
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


def evaluate_lenet5(learning_rate=0.06,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=[4, 4],
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40):
    maxSentLength = max_truncate + 2 * (window_width[0] - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_wikiQA_corpus(
        rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt',
        rootPath + 'test_filtered.txt', max_truncate,
        maxSentLength)  #vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test = load_mts_wikiQA(
        mtPath + 'result_train/concate_2mt_train.txt',
        mtPath + 'result_test/concate_2mt_test.txt')
    wm_train, wm_test = load_wmf_wikiQA(
        rootPath + 'train_word_matching_scores.txt',
        rootPath + 'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]
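    # The loader appears to interleave the two sentences of each pair: even rows
    # (::2) hold the left sentence (the question) and odd rows (1::2) the right
    # sentence (the candidate answer); lengths and padding are split the same way.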

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')
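    # The index matrices are stored as floatX shared variables (so they can sit on
    # the GPU like the rest of the data) and then cast back to int64 symbolically,
    # because embedding lookups require integer indices.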

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    wmf = T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape (embedding dim x padded sentence length)
    filter_size = (emb_size, window_width[0])
    filter_size_2 = (nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1
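    # Wide ("full") convolution: a filter of width w over a sentence of length n
    # yields n + w - 1 output columns, hence length_after_wideConv = ishape[1] + filter_size[1] - 1.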

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded sentences into 4D tensors
    # (batch, 1, emb_size, sent_len), as expected by the convolution layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_size[0],
                                                    filter_size[1]))
    load_model_from_file([conv_W, conv_b])
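    # The word-level convolution weights are initialised from a previously stored
    # model; since layer0_para is excluded from `params` further down, these weights
    # appear to stay frozen during this training run.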
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng,
                                    input=layer0_l_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_r = Conv_with_input_para(rng,
                                    input=layer0_r_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output')
    layer0_para = [conv_W, conv_b]

    layer1 = Average_Pooling(rng,
                             input_l=layer0_l_output,
                             input_r=layer0_r_output,
                             kern=nkerns[0],
                             left_l=left_l,
                             right_l=right_l,
                             left_r=left_r,
                             right_r=right_r,
                             length_l=length_l + filter_size[1] - 1,
                             length_r=length_r + filter_size[1] - 1,
                             dim=maxSentLength + filter_size[1] - 1,
                             window_size=window_width[0],
                             maxSentLength=maxSentLength)
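    # layer1: attention-style average pooling over the two wide-convolution outputs;
    # the lengths and pad offsets passed in presumably let the pooling ignore the
    # zero-padding around each sentence.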

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1,
                                                      filter_size_2[0],
                                                      filter_size_2[1]))
    #load_model_from_file([conv2_W, conv2_b])
    layer2_l = Conv_with_input_para(
        rng,
        input=layer1.output_tensor_l,
        image_shape=(batch_size, 1, nkerns[0], ishape[1]),
        filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_r = Conv_with_input_para(
        rng,
        input=layer1.output_tensor_r,
        image_shape=(batch_size, 1, nkerns[0], ishape[1]),
        filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_para = [conv2_W, conv2_b]

    layer3 = Average_Pooling_for_Top(rng,
                                     input_l=layer2_l.output,
                                     input_r=layer2_r.output,
                                     kern=nkerns[1],
                                     left_l=left_l,
                                     right_l=right_l,
                                     left_r=left_r,
                                     right_r=right_r,
                                     length_l=length_l + filter_size_2[1] - 1,
                                     length_r=length_r + filter_size_2[1] - 1,
                                     dim=maxSentLength + filter_size_2[1] - 1)

    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)

    sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l = sum_uni_l / layer0_l_input.shape[3]
    norm_uni_l = sum_uni_l / T.sqrt((sum_uni_l**2).sum())
    sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r = sum_uni_r / layer0_r_input.shape[3]
    norm_uni_r = sum_uni_r / T.sqrt((sum_uni_r**2).sum())

    uni_cosine = cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine = cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi = debug_print(
        T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1, 1)),
        'uni_sigmoid_simi')
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r))  #25.2%
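    # Shallow word-level similarity features straight from the embeddings: cosine of
    # the summed (and averaged / L2-normalised) embedding vectors of the two
    # sentences, plus a Euclidean-distance similarity of the form 1 / (1 + d).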
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))

    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [  #mts,
            uni_cosine,  #eucli_1_exp,#uni_sigmoid_simi,  #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
            layer1.
            output_cosine,  #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
            layer3.output_cosine,
            len_l,
            len_r,
            wmf
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(1) + (1) + (1) + 2 + 2,
                                n_out=2)
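    # The classifier sees 7 concatenated features: uni_cosine (1), layer1.output_cosine
    # (1), layer3.output_cosine (1), the two normalised sentence lengths (2) and the
    # word-matching features wmf (presumably 2 columns), matching n_in = 1+1+1+2+2.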

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (conv2_W**2).sum(), 'L2_reg'
    )  #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.prop_for_posi, layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            wmf: wm_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer2_para  #+layer0_para

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches / 5, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check roughly five times per epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    svm_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
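            # With update_freq > 1 the cost of intermediate batches is only accumulated
            # in cost_tmp (no parameter update); on every update_freq-th batch the
            # accumulated value is folded into the real update via
            # cost = (cost_this + cost_tmp) / update_freq defined above.  With the
            # default update_freq=1 the else-branch runs for every batch.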
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    prob_i, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt',
                                           test_probs)
                #now, check MAP and MRR
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                     'model %f, MRR  %f') %
                    (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm = clf.decision_function(test_features)
                MAP_svm, MRR_svm = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_svm)

                lr = LinearRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                MAP_lr, MRR_lr = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR, MAP: ', MAP_lr, ' MRR: ', MRR_lr
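                # Besides the network's own posterior, the learned feature vectors
                # (layer3_input) are also fed to an SVM and to a linear regression,
                # and their scores are evaluated with MAP / MRR as alternative
                # re-rankers over the test candidates.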

            if patience <= iter:
                done_looping = True
                break
        #after each epoch, optionally adjust the effective batch size via update_freq
        #(as written, multiplying and dividing by 1 leave it unchanged)
        if epoch % 2 == 1:
            update_freq = update_freq * 1
        else:
            update_freq = update_freq / 1
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))


def evaluate_lenet5(learning_rate=0.09,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.00065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus(
        rootPath + 'vocab.txt', rootPath + 'mc500.train.tsv_standardlized.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt', max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    [
        train_data_D, train_data_Q, train_data_A, train_Y, train_Label,
        train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A,
        train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
        train_rightPad_D, train_rightPad_D_s, train_rightPad_Q,
        train_rightPad_A
    ] = train_data
    [
        test_data_D, test_data_Q, test_data_A, test_Y, test_Label,
        test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A,
        test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A = T.lvector()
    y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A = T.lscalar()

    #wmf=T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded document / question / answer into 4D tensors
    # (batch-or-doc, 1, emb_size, sent_len), as expected by the convolution layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = embeddings[index_Q.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A_input = embeddings[index_A.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    #     load_model_for_conv1([conv_W, conv_b])

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_Q = Conv_with_input_para(
        rng,
        input=layer0_Q_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A = Conv_with_input_para(
        rng,
        input=layer0_A_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output = debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A_output = debug_print(layer0_A.output, 'layer0_A.output')
    layer0_para = [conv_W, conv_b]

    layer1_DQ = Average_Pooling_Scan(rng,
                                     input_D=layer0_D_output,
                                     input_r=layer0_Q_output,
                                     kern=nkerns[0],
                                     left_D=left_D,
                                     right_D=right_D,
                                     left_D_s=left_D_s,
                                     right_D_s=right_D_s,
                                     left_r=left_Q,
                                     right_r=right_Q,
                                     length_D_s=len_D_s + filter_words[1] - 1,
                                     length_r=len_Q + filter_words[1] - 1,
                                     dim=maxSentLength + filter_words[1] - 1,
                                     doc_len=maxDocLength,
                                     topk=3)
    layer1_DA = Average_Pooling_Scan(rng,
                                     input_D=layer0_D_output,
                                     input_r=layer0_A_output,
                                     kern=nkerns[0],
                                     left_D=left_D,
                                     right_D=right_D,
                                     left_D_s=left_D_s,
                                     right_D_s=right_D_s,
                                     left_r=left_A,
                                     right_r=right_A,
                                     length_D_s=len_D_s + filter_words[1] - 1,
                                     length_r=len_A + filter_words[1] - 1,
                                     dim=maxSentLength + filter_words[1] - 1,
                                     doc_len=maxDocLength,
                                     topk=3)
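    # Average_Pooling_Scan appears to scan over the document's sentences, matching
    # each sentence representation against the question / answer representation and
    # keeping the top-k (topk=3) best matches, producing both a document-level tensor
    # (output_D) and sentence-level summary vectors.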

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    layer2_DQ = Conv_with_input_para(
        rng,
        input=layer1_DQ.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA = Conv_with_input_para(
        rng,
        input=layer1_DA.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    layer2_Q = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DQ.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_Q_output_sent_rep_Dlevel = debug_print(
        layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A_output_sent_rep_Dlevel = debug_print(
        layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel')
    layer2_para = [conv2_W, conv2_b]

    layer3_DQ = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DQ.output,
        input_r=layer2_Q_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA.output,
        input_r=layer2_A_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way
    high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    transform_gate_DQ = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b),
        'transform_gate_DQ')
    transform_gate_DA = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b),
        'transform_gate_DA')
    transform_gate_Q = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b),
        'transform_gate_Q')
    transform_gate_A = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b),
        'transform_gate_A')
    highW_para = [high_W, high_b]

    overall_D_Q = debug_print(
        (1.0 - transform_gate_DQ) * layer1_DQ.output_D_sent_level_rep +
        transform_gate_DQ * layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A = (
        1.0 - transform_gate_DA
    ) * layer1_DA.output_D_sent_level_rep + transform_gate_DA * layer3_DA.output_D_doc_level_rep
    overall_Q = (
        1.0 - transform_gate_Q
    ) * layer1_DQ.output_QA_sent_level_rep + transform_gate_Q * layer2_Q.output_sent_rep_Dlevel
    overall_A = (
        1.0 - transform_gate_A
    ) * layer1_DA.output_QA_sent_level_rep + transform_gate_A * layer2_A.output_sent_rep_Dlevel

    simi_sent_level = debug_print(
        cosine(
            layer1_DQ.output_D_sent_level_rep +
            layer1_DA.output_D_sent_level_rep,
            layer1_DQ.output_QA_sent_level_rep +
            layer1_DA.output_QA_sent_level_rep), 'simi_sent_level')
    simi_doc_level = debug_print(
        cosine(
            layer3_DQ.output_D_doc_level_rep +
            layer3_DA.output_D_doc_level_rep,
            layer2_Q.output_sent_rep_Dlevel + layer2_A.output_sent_rep_Dlevel),
        'simi_doc_level')
    simi_overall_level = debug_print(
        cosine(overall_D_Q + overall_D_A, overall_Q + overall_A),
        'simi_overall_level')

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    layer4_input = debug_print(
        T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level],
                      axis=1),
        'layer4_input')  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4 = LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2)
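    # layer4 is a binary classifier over just three features -- the cosine
    # similarities at sentence level, document level and the highway-combined overall
    # level (hence n_in=3); the positive class presumably means "A answers Q given D".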

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer4.W**2).sum() + (high_W**2).sum() + (conv2_W**2).sum() +
        (conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer4.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    #
    #     [train_data_D, train_data_Q, train_data_A, train_Y, train_Label,
    #                  train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A,
    #                 train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
    #                 train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data
    #     [test_data_D, test_data_Q, test_data_A, test_Y, test_Label,
    #                  test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A,
    #                 test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
    #                 test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data
    #     index = T.lscalar()
    #     index_D = T.lmatrix()   # now, x is the index matrix, must be integer
    #     index_Q = T.lvector()
    #     index_A= T.lvector()
    #
    #     y = T.lvector()
    #     len_D=T.lscalar()
    #     len_D_s=T.lvector()
    #     len_Q=T.lscalar()
    #     len_A=T.lscalar()
    #
    #     left_D=T.lscalar()
    #     left_D_s=T.lvector()
    #     left_Q=T.lscalar()
    #     left_A=T.lscalar()
    #
    #     right_D=T.lscalar()
    #     right_D_s=T.lvector()
    #     right_Q=T.lscalar()
    #     right_A=T.lscalar()
    #
    #
    #     #wmf=T.dmatrix()
    #     cost_tmp=T.dscalar()

    test_model = theano.function(
        [index],
        [layer4.errors(y), layer4_input, y, layer4.prop_for_posi],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_Q: test_data_Q[index],
            index_A: test_data_A[index],
            y: test_Y[index:index + batch_size],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A: test_Length_A[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A: test_leftPad_A[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A: test_rightPad_A[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params + layer2_para + layer0_para + highW_para

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index + batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer4.errors(y), layer4_input, y],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index + batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        #         readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r')
        #         train_pairs=[]
        #         train_y=[]
        #         for line in readfile:
        #             tokens=line.strip().split('\t')
        #             listt=tokens[0]+'\t'+tokens[1]
        #             train_pairs.append(listt)
        #             train_y.append(tokens[2])
        #         readfile.close()
        #         writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w')
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             (batch_start * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij

            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(cost_average)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                test_y = []
                test_features = []
                test_prop = []
                for i in test_batch_start:
                    test_loss, layer3_input, y, posi_prop = test_model(i)
                    test_prop.append(posi_prop[0][0])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = compute_test_acc(test_y, test_prop)
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()
                clf = svm.SVC(
                    kernel='linear'
                )  #OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results = clf.decision_function(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.decision_function(test_features)

                acc_svm = compute_test_acc(test_y, results)
                acc_lr = compute_test_acc(test_y, results_lr)

                find_better = False
                if acc_svm > max_acc:
                    max_acc = acc_svm
                    best_epoch = epoch
                    find_better = True
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                if acc_lr > max_acc:
                    max_acc = acc_lr
                    best_epoch = epoch
                    find_better = True
                print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'


#                 if find_better==True:
#                     store_model_to_file(layer2_para, best_epoch)
#                     print 'Finished storing best conv params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
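
The train_model functions in these examples all apply the same per-parameter AdaGrad rule: an accumulator collects squared gradients, and each step divides the learning rate by the square root of that accumulator. Below is a minimal NumPy sketch of that rule outside Theano; the function name, the toy data, and the small eps term are illustrative additions, not part of the original code.

# Minimal NumPy sketch of the AdaGrad update used by train_model above.
# The eps term is added here for numerical safety; the Theano code divides
# by T.sqrt(acc) directly.
import numpy

def adagrad_step(params, grads, accumulators, learning_rate=0.001, eps=1e-8):
    """Return updated parameters; the accumulators are modified in place."""
    updated = []
    for param, grad, acc in zip(params, grads, accumulators):
        acc += grad ** 2                      # accumulate squared gradients
        updated.append(param - learning_rate * grad / (numpy.sqrt(acc) + eps))
    return updated

# toy usage with a single 3-dimensional parameter
w = numpy.zeros(3)
acc_w = numpy.zeros(3)
g = numpy.array([0.1, -0.2, 0.3])
w, = adagrad_step([w], [g], [acc_w], learning_rate=0.1)
print(w)
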
Example #6
0
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[44], batch_size=1, window_width=3,
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=24):
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_nonoverlap.txt', rootPath+'train_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate,maxSentLength, entailment=True)#vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_rule_features_cosine_eucli_negation_len1_len2.txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2.txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]

    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_nonoverlap_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices into a 4D tensor of shape
    # (batch_size, 1, emb_size, maxSentLength), compatible with the convolution layer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    

    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    
    linear=Linear(norm_uni_l, norm_uni_r)
    poly=Poly(norm_uni_l, norm_uni_r)
    sigmoid=Sigmoid(norm_uni_l, norm_uni_r)
    rbf=RBF(norm_uni_l, norm_uni_r)
    gesd=GESD(norm_uni_l, norm_uni_r)
    
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([mts,
                                eucli_1,uni_cosine,#linear, poly,sigmoid,rbf, gesd, #sum_uni_r-sum_uni_l,
                                layer1.output_eucli_to_simi,layer1.output_cosine, #layer1.output_vector_r-layer1.output_vector_l,
                                len_l, len_r,
                                extra
                                #discri
                                #wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=14+(2)+(2)+2+5, n_out=3)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            #wmf: wm_test[index: index + batch_size]
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params 
    params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()
                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                #lr=LinearRegression().fit(train_features, train_y)
                #results_lr=lr.predict(test_features)
                corr_count=0
                #corr_lr=0
                corr_neu=0
                neu_co=0
                corr_ent=0
                ent_co=0
                corr_contr=0
                contr_co=0
                test_size=len(test_y)
                for i in range(test_size):
                    if test_y[i]==0:#NEUTRAL
                        neu_co+=1
                        if results[i]==test_y[i]:
                            corr_neu+=1
                    elif test_y[i]==1:#ENTAILMENT
                        ent_co+=1
                        if results[i]==test_y[i]:
                            corr_ent+=1
                    elif test_y[i]==2:#CONTRADICTION
                        contr_co+=1
                        if results[i]==test_y[i]:
                            corr_contr+=1
                    '''
                    if results[i]==test_y[i]:
                        corr_count+=1
                        if test_y[i]==0: #NEUTRAL
                            corr_neu+=1
                    '''
                        
                    #if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                    #    corr_lr+=1
                corr_count=corr_neu+corr_ent+corr_contr
                acc=corr_count*1.0/test_size
                acc_neu=corr_neu*1.0/neu_co
                acc_ent=corr_ent*1.0/ent_co
                acc_contr=corr_contr*1.0/contr_co
                #acc_lr=corr_lr*1.0/test_size
                if acc > max_acc:
                    max_acc=acc
                    best_epoch=epoch
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch                 
                #if acc_lr> max_acc:
                #    max_acc=acc_lr
                #    best_epoch=epoch
                print '\t\t\tsvm acc: ', acc, ' max acc: ',    max_acc,'(at',best_epoch,')',' Neu: ',acc_neu, ' Ent: ',acc_ent, ' Contr: ',acc_contr 

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
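
The per-class counters above (corr_neu / corr_ent / corr_contr) compute overall and per-class accuracy for the three SICK labels. The sketch below reproduces that bookkeeping compactly in NumPy on made-up labels, using the class ids from the comments above (0=NEUTRAL, 1=ENTAILMENT, 2=CONTRADICTION); it assumes every class occurs at least once in the gold labels.

# Illustrative only: overall and per-class accuracy on toy predictions.
import numpy

def per_class_accuracy(gold, pred, n_classes=3):
    gold = numpy.asarray(gold)
    pred = numpy.asarray(pred)
    overall = numpy.mean(gold == pred)
    per_class = [numpy.mean(pred[gold == c] == c) for c in range(n_classes)]
    return overall, per_class

overall, (acc_neu, acc_ent, acc_contr) = per_class_accuracy([0, 1, 2, 1], [0, 1, 1, 1])
print(overall)     # 0.75
print(acc_contr)   # 0.0: the single CONTRADICTION example was misclassified
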
Example #7
0
    layer2_A1 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA1.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)

    layer2_A1_output_sent_rep_Dlevel=debug_print(layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')

    layer3_DA1=Average_Pooling_for_Top(rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    
    #high-way
    

    transform_gate_DA1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1')
    transform_gate_DA2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2')
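
This fragment stops right after building the highway transform gates, so the gating step itself is not visible here. The standard highway combination those gates feed into is gate * transformed + (1 - gate) * input; the NumPy sketch below illustrates that combination with made-up names and shapes, not the original layer code.

# Illustrative highway combination: a sigmoid gate mixes a transformed
# representation with the untouched input.
import numpy

def sigmoid(x):
    return 1.0 / (1.0 + numpy.exp(-x))

def highway(x, transformed, high_W, high_b):
    gate = sigmoid(numpy.dot(high_W, x) + high_b)   # transform gate, same size as x
    return gate * transformed + (1.0 - gate) * x    # carry gate is (1 - gate)

# toy usage with a 4-dimensional representation
rng = numpy.random.RandomState(0)
x = rng.randn(4)
high_W = rng.randn(4, 4)
high_b = numpy.zeros(4)
print(highway(x, numpy.tanh(x), high_W, high_b))
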
def evaluate_lenet5(learning_rate=0.06, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=[4,4],
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=40):
    maxSentLength=max_truncate+2*(window_width[0]-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    filter_size=(emb_size,window_width[0])
    filter_size_2=(nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices into a 4D tensor of shape
    # (batch_size, 1, emb_size, maxSentLength), compatible with the convolution layer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    load_model_from_file([conv_W, conv_b])
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_para=[conv_W, conv_b]    
    
    layer1=Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                     left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                      length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                       dim=maxSentLength+filter_size[1]-1, window_size=window_width[0], maxSentLength=maxSentLength)
    
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]))
    #load_model_from_file([conv2_W, conv2_b])
    layer2_l = Conv_with_input_para(rng, input=layer1.output_tensor_l,
            image_shape=(batch_size, 1, nkerns[0], ishape[1]),
            filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]), W=conv2_W, b=conv2_b)
    layer2_r = Conv_with_input_para(rng, input=layer1.output_tensor_r,
            image_shape=(batch_size, 1, nkerns[0], ishape[1]),
            filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]), W=conv2_W, b=conv2_b)
    layer2_para=[conv2_W, conv2_b]
        
    layer3=Average_Pooling_for_Top(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size_2[1]-1, length_r=length_r+filter_size_2[1]-1,
                                       dim=maxSentLength+filter_size_2[1]-1)


    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([#mts,
                                uni_cosine,#eucli_1_exp,#uni_sigmoid_simi,  #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                layer1.output_cosine,  #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer3.output_cosine,
                                len_l, len_r,wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(1)+(1)+(1)+2+2, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv2_W**2).sum(), 'L2_reg')#+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer2_para#+layer0_para
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    prob_i, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs)
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                           'model %f, MRR  %f') %
                          (epoch, minibatch_index, n_train_batches,MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm=clf.decision_function(test_features)
                MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm)
                
                lr=LinearRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        #after each epoch, adjust update_freq; multiplying/dividing by 1 makes this a
        #no-op as written (change the factors to actually grow/shrink the batch size)
        if epoch%2==1:
            update_freq=update_freq*1
        else:
            update_freq=update_freq/1
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
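
compute_map_mrr above is an external helper whose implementation is not shown in this example. The sketch below only illustrates the conventional MAP / MRR computation from per-question relevance labels and ranking scores; the grouped input format and the names are assumptions for illustration, not the helper's actual interface (which takes the gold test file and the score list).

# Illustrative MAP / MRR over a list of (labels, scores) pairs, one pair per question.
import numpy

def map_mrr(grouped):
    aps, rrs = [], []
    for labels, scores in grouped:
        order = numpy.argsort(scores)[::-1]           # rank candidates by score, descending
        ranked = numpy.asarray(labels)[order]
        if ranked.sum() == 0:                         # skip questions with no relevant answer
            continue
        hits = numpy.flatnonzero(ranked) + 1          # 1-based ranks of the relevant answers
        precisions = numpy.arange(1, len(hits) + 1) / hits.astype(float)
        aps.append(precisions.mean())                 # average precision for this question
        rrs.append(1.0 / hits[0])                     # reciprocal rank of the first hit
    return numpy.mean(aps), numpy.mean(rrs)

# toy usage: two questions, three candidate answers each
MAP, MRR = map_mrr([([0, 1, 0], [0.1, 0.9, 0.3]),
                    ([1, 0, 0], [0.2, 0.8, 0.1])])
print(MAP)   # 0.75
print(MRR)   # 0.75
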
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3,
                    maxSentLength=30, emb_size=300, hidden_size=[300,10],
                    margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'
    rng = numpy.random.RandomState(23455)
    datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    vocab_size=len(word2id)+1
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2]
    indices_train_r=indices_train[1::2]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2]
    indices_test_r=indices_test[1::2]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    train_size = len(indices_train_l)
    test_size = len(indices_test_l)
    
    train_batch_start=range(train_size)
    test_batch_start=range(test_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int32')
#     indices_train_r=T.cast(indices_train_r, 'int32')
#     indices_test_l=T.cast(indices_test_l, 'int32')
#     indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng)
#     rand_values[0]=numpy.array(numpy.zeros(emb_size))
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
#     index = T.iscalar()
    x_index_l = T.imatrix()   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix()
    y = T.ivector()  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.fscalar()
    norm_length_r=T.fscalar()
    mts=T.fmatrix()
    wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices into a 4D tensor of shape
    # (batch_size, 1, emb_size, maxSentLength), compatible with the convolution layer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0]))
    layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0]))
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    
    
    
    sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
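    # Feature vector for the hidden layers: unigram similarities (euclidean-based
    # and cosine), the max-pooled conv features with their similarities, the
    # attention-pooled layer1 representations with their similarities, and
    # layer1's attention scores.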
    HL_layer_1_input=T.concatenate([
#                                 mts, 
                                eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                uni_cosine,
#                                 sum_uni_l,
#                                 sum_uni_r,
#                                 sum_uni_l+sum_uni_r,
                                1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)),
                                cosine(layer0_l_output_maxpool, layer0_r_output_maxpool),
                                layer0_l_output_maxpool,
                                layer0_r_output_maxpool,
                                T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10),
                                
                                layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer1.output_cosine,
                                layer1.output_vector_l,
                                layer1.output_vector_r,
                                T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10),
#                                 len_l, len_r
                                layer1.output_attentions
#                                 wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input,
                                mts, len_l, len_r
#                                 wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_size=1+1 + 1+1+3*nkerns[0] + 1+1+3*nkerns[0] + 10*10  # unigram simis + maxpool features + layer1 features + layer1.output_attentions (10*10)
    
    HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh)
    
    LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1)
    LR_layer_input_with_extra=T.concatenate([HL_layer_2.output,  HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output,
    
    LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2)
#     LR_layer_input=HL_layer_2.output
#     LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2)

#     layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
#     diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg
    

    test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True)



    params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

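    # AdaGrad with gradient clipping: gradients are clipped to [-0.5, 0.5],
    # their squares accumulated per parameter, and each update is scaled by the
    # inverse square root of that accumulator (a small epsilon avoids division by zero).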
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        clipped_grad = T.clip(grad_i, -0.5, 0.5)
        acc = acc_i + T.sqr(clipped_grad)
        updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True)

    train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant


    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    nn_max_acc=0.0
    best_iter=0
    cost_tmp=0.0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data

        for index in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * train_size + minibatch_index +1

            minibatch_index=minibatch_index+1

#             if iter%update_freq != 0:
#                 cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
#                 #print 'cost_ij: ', cost_ij
#                 cost_tmp+=cost_ij
#                 error_sum+=error_ij
#             else:

            cost_i, error_i= train_model(indices_train_l[index: index + batch_size],
                                                              indices_train_r[index: index + batch_size],
                                                              trainY[index: index + batch_size],
                                                              trainLeftPad_l[index],
                                                              trainRightPad_l[index],
                                                              trainLeftPad_r[index],
                                                              trainRightPad_r[index],
                                                              trainLengths_l[index],
                                                              trainLengths_r[index],
                                                              normalized_train_length_l[index],
                                                              normalized_train_length_r[index],
                                                              mt_train[index: index + batch_size],
                                                              wm_train[index: index + batch_size])
            cost_tmp+=cost_i
            if iter < 6000 and iter %100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
            if iter >= 6000 and iter % 100 == 0:
#             if iter%100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
                test_losses=[]
                test_y=[]
                test_features=[]
                for index in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size],
                                                                  indices_test_r[index: index + batch_size],
                                                                  testY[index: index + batch_size],
                                                                  testLeftPad_l[index],
                                                                  testRightPad_l[index],
                                                                  testLeftPad_r[index],
                                                                  testRightPad_r[index],
                                                                  testLengths_l[index],
                                                                  testLengths_r[index],
                                                                  normalized_test_length_l[index],
                                                                  normalized_test_length_r[index],
                                                                  mt_test[index: index + batch_size],
                                                                  wm_test[index: index + batch_size])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc = (1-test_score) * 100.
                if test_acc > nn_max_acc:
                    nn_max_acc = test_acc
                print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc

                # Optionally re-score the extracted hidden features with an SVM and a
                # linear regression (use_svm is assumed to be a flag defined at module level)
                if use_svm:
                    train_y=[]
                    train_features=[]
                    for index in train_batch_start: 
                        cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size],
                                                                  indices_train_r[index: index + batch_size],
                                                                  trainY[index: index + batch_size],
                                                                  trainLeftPad_l[index],
                                                                  trainRightPad_l[index],
                                                                  trainLeftPad_r[index],
                                                                  trainRightPad_r[index],
                                                                  trainLengths_l[index],
                                                                  trainLengths_r[index],
                                                                  normalized_train_length_l[index],
                                                                  normalized_train_length_r[index],
                                                                  mt_train[index: index + batch_size],
                                                                  wm_train[index: index + batch_size])
                        train_y.append(y[0])
                        train_features.append(layer3_input[0])
                        #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                    #write_feature.close()
     
                    clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                    clf.fit(train_features, train_y)
                    results=clf.predict(test_features)
                    lr=LinearRegression().fit(train_features, train_y)
                    results_lr=lr.predict(test_features)
                    corr_count=0
                    corr_lr=0
                    test_size=len(test_y)
                    for i in range(test_size):
                        if results[i]==test_y[i]:
                            corr_count+=1
                        if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                            corr_lr+=1
                    acc=corr_count*1.0/test_size
                    acc_lr=corr_lr*1.0/test_size
                    if acc > max_acc:
                        max_acc=acc
                        best_iter=iter
                    if acc_lr> max_acc:
                        max_acc=acc_lr
                        best_iter=iter
                    print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at iter: ', best_iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #10
0
def evaluate_lenet5(file_name,
                    input_filename,
                    model_filename,
                    learning_rate=0.001,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=128,
                    max_d_length=128,
                    margin=0.3):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    f = open(file_name, 'w')
    f.write("model options " + str(model_options) + '\n')
    #rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN(
        'vocab_table_wenyan.txt', input_filename, input_filename, max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test
    f.write('train_size : ' + str(train_size))
    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
    #          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
    #         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
    #         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
    # return results, line_control
    [
        train_data_D, train_data_A1, train_Label, train_Length_D,
        train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_A1, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_A1
    ] = train_data
    [
        test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s,
        test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_A1
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
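    # Row 0 of the embedding matrix stays all-zero (presumably the padding token);
    # the remaining rows are initialized from the pre-trained word2vec file.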
    rand_values = load_word2vec_to_init(rand_values, 'vectors_wenyan2.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    #   index_Q = T.lvector()
    index_A1 = T.lvector()
    #    index_A2= T.lvector()
    #     index_A3= T.lvector()
    #     index_A4= T.lvector()
    y = T.lscalar()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    #     len_Q=T.lscalar()
    len_A1 = T.lscalar()
    #    len_A2=T.lscalar()
    #     len_A3=T.lscalar()
    #     len_A4=T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    #     left_Q=T.lscalar()
    left_A1 = T.lscalar()
    #    left_A2=T.lscalar()
    #     left_A3=T.lscalar()
    #     left_A4=T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    #     right_Q=T.lscalar()
    right_A1 = T.lscalar()
    #    right_A2=T.lscalar()
    #     right_A3=T.lscalar()
    #     right_A4=T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    f.write('... building the model\n')

    # Reshape the word-index matrices into 4D embedding tensors,
    # compatible with the convolution layers below
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # this requires nkerns[0] and nkerns[1] to have the same dimensionality
    highW_para = [high_W, high_b]
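    # Trainable parameters: sentence-level conv (layer2), word-level conv (layer0)
    # and the highway weights; the embeddings are kept fixed. The logistic layers
    # are appended later and the full set is loaded from model_filename below.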
    params = layer2_para + layer0_para + highW_para  #+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
    # image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #  filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')
    #layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output')
    #     layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output')
    #     layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output')

    #     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
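    # Average_Pooling_Scan matches the document (sentence by sentence) against the
    # answer, presumably via attention-weighted average pooling with top-k matching;
    # it exposes a sentence-level document representation (output_D /
    # output_D_sent_level_rep) and an answer representation (output_QA_sent_level_rep).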
    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=1)
    #layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0],
    # left_D=left_D, right_D=right_D,
    # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2,
    # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1,
    #  dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    #     layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    #     layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)

    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    #     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #  image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #  filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    #     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #   image_shape=(batch_size, 1, nkerns[0], 1),
    # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    #     layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    #     layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
    #     layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')

    #     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=1)
    #layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1],
    #   left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #   length_l=len_D+filter_sents[1]-1, length_r=1,
    #    dim=maxDocLength+filter_sents[1]-1, topk=3)
    #     layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    #     layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)

    #high-way
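    # Highway combination: a sigmoid gate T = sigmoid(high_W . rep + high_b)
    # blends the sentence-level and document-level representations as
    # (1 - T) * sent_level_rep + T * doc_level_rep.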

    #     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')
    # transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2')
    #     transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3')
    #     transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4')

    #     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    # overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep
    #     overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep
    #     overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep

    #     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
    #overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel
    #     overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel
    #     overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel

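    # Three cosine similarities between document and answer: at the sentence level,
    # at the document level, and on the highway-blended "overall" representations.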
    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    #simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
    #     simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
    #     simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    #simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
    #     simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
    #     simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')
    #simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2')
    #     simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3')
    #     simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4')

    #     simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1
    #     simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0
    #simi_1 = simi_doc_level1
    #simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
    #     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
    #     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0
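    # Each scalar similarity is passed through its own small logistic layer (1 -> 2)
    # and a softmax, giving three 2-way distributions whose log-likelihoods are
    # averaged in the cost below.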
    logistic_w, logistic_b = create_logistic_para(rng, 1, 2)
    logistic_para = [logistic_w, logistic_b]
    sent_w, sent_b = create_logistic_para(rng, 1, 2)
    doc_w, doc_b = create_logistic_para(rng, 1, 2)
    sent_para = [sent_w, sent_b]
    doc_para = [doc_w, doc_b]
    params += logistic_para
    params += sent_para
    params += doc_para

    load_model(params, model_filename)
    simi_sent = T.dot(sent_w, simi_sent_level1) + sent_b.dimshuffle(0, 'x')
    simi_sent = simi_sent.dimshuffle(1, 0)
    simi_sent = T.nnet.softmax(simi_sent)
    tmp_sent = T.log(simi_sent)

    simi_doc = T.dot(doc_w, simi_doc_level1) + doc_b.dimshuffle(0, 'x')
    simi_doc = simi_doc.dimshuffle(1, 0)
    simi_doc = T.nnet.softmax(simi_doc)
    tmp_doc = T.log(simi_doc)
    #cost = margin - simi_1
    simi_overall = T.dot(logistic_w,
                         simi_overall_level1) + logistic_b.dimshuffle(0, 'x')
    simi_overall = simi_overall.dimshuffle(1, 0)

    simi_overall = T.nnet.softmax(simi_overall)
    predict = T.argmax(simi_overall, axis=1)
    tmp_overall = T.log(simi_overall)
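    # Cost: mean negative log-likelihood of the gold label under the overall-,
    # doc- and sentence-level softmaxes, plus L2 regularization on the conv,
    # logistic and highway weights.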
    cost = -(tmp_overall[0][y] + tmp_doc[0][y] + tmp_sent[0][y]) / 3.0
    L2_reg = (conv2_W**2).sum() + (conv_W**2).sum() + (logistic_w**2).sum() + (
        high_W**2).sum()
    cost = cost + L2_weight * L2_reg
    #simi_1 = [simi_overall,simi_doc,simi_sent]
    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #     #only use overall_simi
    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    #     posi_simi=simi_overall_level1
    #     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
    #     cost=T.maximum(0.0, margin+simi_2-simi_1)
    #cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)
    #     posi_simi=simi_1
    #     nega_simi=simi_2

    #L2_reg =debug_print((high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    #cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
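    # test_model takes only an example index; `givens` substitutes the
    # corresponding entries of the test data containers for the symbolic inputs.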

    test_model = theano.function(
        [index],
        [cost, simi_overall, simi_doc, simi_sent, predict],
        givens={
            index_D: test_data_D[index],  #a matrix
            #             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            y: test_Label[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            #             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            #            len_A2: test_Length_A2[index],
            #             len_A3: test_Length_A3[index],
            #             len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            #             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            #            left_A2: test_leftPad_A2[index],
            #             left_A3: test_leftPad_A3[index],
            #             left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            #             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
        },
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10)))  #AdaGrad; epsilon guards against a zero denominator
        updates.append((acc_i, acc))


#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))

    train_model = theano.function(
        [index],
        [cost, simi_overall, simi_doc, simi_sent, predict],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            #            index_A2: train_data_A2[index],
            #             index_A3: train_data_A3[index],
            #             index_A4: train_data_A4[index],
            y: train_Label[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            #            len_A2: train_Length_A2[index],
            #             len_A3: train_Length_A3[index],
            #             len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            #            left_A2: train_leftPad_A2[index],
            #             left_A3: train_leftPad_A3[index],
            #             left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            #            right_A2: train_rightPad_A2[index]
            #             right_A3: train_rightPad_A3[index],
            #             right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    f.write('... training\n')
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network
    # on the validation set; in this case we check every epoch
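    # Note: this variant only runs inference. It scores the first four test
    # examples with the loaded model and returns their overall-level softmax
    # outputs, without entering a training loop.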
    cost, simi_overall, simi_doc, simi_sent, predict = test_model(0)
    cost, simi_overall1, simi_doc, simi_sent, predict = test_model(1)
    cost, simi_overall2, simi_doc, simi_sent, predict = test_model(2)
    cost, simi_overall3, simi_doc, simi_sent, predict = test_model(3)
    return simi_overall, simi_overall1, simi_overall2, simi_overall3
    '''