def create_CNN_params(rng):
    """Create the weight/bias pairs for the second convolution stage.

    Four pairs are requested from ``create_conv_para``, always in the order
    pre, gate, main, context, and every call is handed the same ``rng``.

    ``gate_filter_shape``, ``hidden_size`` and ``filter_size`` are not
    parameters of this function; they are resolved from the enclosing scope
    when the function runs.

    Returns:
        An 8-tuple ``(W_pre, b_pre, W_gate, b_gate, W, b, W_context,
        b_context)``.
    """
    # The "pre" and "gate" filters are built from one and the same shape.
    w_pre, b_pre = create_conv_para(rng, filter_shape=gate_filter_shape)
    w_gate, b_gate = create_conv_para(rng, filter_shape=gate_filter_shape)

    # Main filter: width taken from filter_size[0].
    n_filters, in_rows = hidden_size[1], hidden_size[0]
    main_shape = (n_filters, 1, in_rows, filter_size[0])
    w_main, b_main = create_conv_para(rng, filter_shape=main_shape)

    # Context filter: same layout as the main filter but with width 1.
    context_shape = (n_filters, 1, in_rows, 1)
    w_context, b_context = create_conv_para(rng, filter_shape=context_shape)

    return (w_pre, b_pre, w_gate, b_gate, w_main, b_main, w_context,
            b_context)
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    test_3th_size = len(test_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_test size: ', joint_test_size
    print 'train size: ', train_size, ' test size: ', test_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)

    # concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2)
    # concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2))
    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    LR_input = concate_2_matrix
    LR_input_size = hidden_size[0] * 2 + 1
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #batch * 12
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    joint_sents_dot = T.batched_dot(
        joint_sents_tensor3, joint_sents_tensor3.dimshuffle(
            0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    joint_sents_dot_2_matrix = T.nnet.softmax(
        joint_sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    joint_sents_context = T.batched_dot(
        joint_sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        joint_sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0),
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)

    #fine-maxsum
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_fine_max = T.concatenate([
        T.max(test_masked_sents_attconv, axis=1),
        T.max(test_masked_claim_attconv, axis=1)
    ],
                                  axis=1)  #(batch, 2*hidden)
    # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)

    test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max],
                                  axis=1)
    test_LR_input_size = joint_LR_input_size

    test_layer_LR = LogisticRegression(
        rng,
        input=test_LR_input,
        n_in=test_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    max_acc = 0.0
    max_test_f1 = 0.0
    max_acc_full_evi = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                we should consider a rank, instead of binary
                                if triple[0] >0.5: can control the recall, influence the strict_acc
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)
                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, nkerns=[90,90], batch_size=1, window_width=2,
                    maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200,
                    L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=0.2):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DPNQ(rootPath+'vocab_DPNQ.txt', rootPath+'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', rootPath+'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
#     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
#     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

# results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label), 
#          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
#         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
#         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
# return results, line_control
    [train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label, 
                 train_Length_D,train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3]=train_data
    [test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label, 
                 test_Length_D,test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3]=test_data                


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DPNQ_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # now, x is the index matrix, must be integer
#     index_Q = T.lvector()
    index_A1= T.lvector()
    index_A2= T.lvector()
    index_A3= T.lvector()
#     index_A4= T.lvector()
#     y = T.lvector()  
    
    len_D=T.lscalar()
    len_D_s=T.lvector()
#     len_Q=T.lscalar()
    len_A1=T.lscalar()
    len_A2=T.lscalar()
    len_A3=T.lscalar()
#     len_A4=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
#     left_Q=T.lscalar()
    left_A1=T.lscalar()
    left_A2=T.lscalar()
    left_A3=T.lscalar()
#     left_A4=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
#     right_Q=T.lscalar()
    right_A1=T.lscalar()
    right_A2=T.lscalar()
    right_A3=T.lscalar()
#     right_A4=T.lscalar()
        


    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
#     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
#     layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
        
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
    layer0_para=[conv_W, conv_b] 
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    layer2_para=[conv2_W, conv2_b]
    high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # this part decides nkern[0] and nkern[1] must be in the same dimension
    highW_para=[high_W, high_b]
    params = layer2_para+layer0_para+highW_para#+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
            image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    
    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
#     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output=debug_print(layer0_A1.output, 'layer0_A1.output')
    layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output')
    layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output')
#     layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output')
       

#     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
#                                       left_D=left_D, right_D=right_D,
#                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, 
#                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
#                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA1=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A1, right_r=right_A1, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A1+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
#     layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0],
#                                       left_D=left_D, right_D=right_D,
#                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4, 
#                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1,
#                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    
    
    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
#     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
#             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(rng, input=layer1_DA1.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
#             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #conv single Q and A into doc level with same conv weights
#     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
#             image_shape=(batch_size, 1, nkerns[0], 1),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA1.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
#             image_shape=(batch_size, 1, nkerns[0], 1),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel=debug_print(layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
#     layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')
    
    
#     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
#                      left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
#                       length_l=len_D+filter_sents[1]-1, length_r=1,
#                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1=Average_Pooling_for_Top(rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
#     layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1],
#                      left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
#                       length_l=len_D+filter_sents[1]-1, length_r=1,
#                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    
    #high-way
    
#     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1')
    transform_gate_DA2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2')
    transform_gate_DA3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3')
#     transform_gate_DA4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4')
#     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1')
    transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2')
#     transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3')
#     transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4')
    
        
#     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1=(1.0-transform_gate_DA1)*layer1_DA1.output_D_sent_level_rep+transform_gate_DA1*layer3_DA1.output_D_doc_level_rep
    overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep
    overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep
#     overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep
    
#     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1=(1.0-transform_gate_A1)*layer1_DA1.output_QA_sent_level_rep+transform_gate_A1*layer2_A1.output_sent_rep_Dlevel
    overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel
#     overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel
#     overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel
    
    simi_sent_level1=debug_print(cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
#     simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
#     simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')
  
  
    simi_doc_level1=debug_print(cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
#     simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
#     simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    
    simi_overall_level1=debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1')
    simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2')
#     simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3')
#     simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4')

#     simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1
#     simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2
 
    simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0
    simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
#     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
#     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0 
    


#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))
 
#     #only use overall_simi    
#     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
#     posi_simi=simi_overall_level1
#     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
#     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
#     cost=T.maximum(0.0, margin+simi_2-simi_1)
    simi_PQ=cosine(layer1_DA1.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep)
    simi_NQ=cosine(layer1_DA2.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep)
    #bad matching at overall level
#     simi_PQ=cosine(overall_A1, overall_D_A3)
#     simi_NQ=cosine(overall_A2, overall_D_A3)
    match_cost=T.maximum(0.0, margin+simi_NQ-simi_PQ) 
    cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)
    cost=cost#+match_cost
#     posi_simi=simi_1
#     nega_simi=simi_2


    
    L2_reg =debug_print((high_W**2).sum()+3*(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    


    
    test_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2],
          givens={
            index_D: test_data_D[index], #a matrix
#             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
#             index_A4: test_data_A4[index],

            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
#             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
#             len_A4: test_Length_A4[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
#             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
#             left_A4: test_leftPad_A4[index],
        
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
#             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index]
#             right_A4: test_rightPad_A4[index]
            
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
 
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    
  
    train_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], updates=updates,
          givens={
            index_D: train_data_D[index],
#             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
#             index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
#             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
#             len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
#             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
#             left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
#             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
#             right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2],
          givens={
            index_D: train_data_D[index],
#             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
#             index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
#             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
#             len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
#             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
#             left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
#             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
#             right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data


        posi_train_sent=[]
        nega_train_sent=[]
        posi_train_doc=[]
        nega_train_doc=[]
        posi_train_overall=[]
        nega_train_overall=[]
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) )
            sys.stdout.flush()
            minibatch_index=minibatch_index+1
            
            cost_average, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2= train_model(batch_start)
            posi_train_sent.append(simi_sent_level1)
            nega_train_sent.append(simi_sent_level2)
            posi_train_doc.append(simi_doc_level1)
            nega_train_doc.append(simi_doc_level2)
            posi_train_overall.append(simi_overall_level1)
            nega_train_overall.append(simi_overall_level2)
            if iter % n_train_batches == 0:
                corr_train_sent=compute_corr(posi_train_sent, nega_train_sent)
                corr_train_doc=compute_corr(posi_train_doc, nega_train_doc)
                corr_train_overall=compute_corr(posi_train_overall, nega_train_overall)
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+'corr rate:'+str(corr_train_sent*300.0/train_size)+' '+str(corr_train_doc*300.0/train_size)+' '+str(corr_train_overall*300.0/train_size)

            
            if iter % validation_frequency == 0:
                posi_test_sent=[]
                nega_test_sent=[]
                posi_test_doc=[]
                nega_test_doc=[]
                posi_test_overall=[]
                nega_test_overall=[]
                for i in test_batch_start:
                    cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2=test_model(i)
                    posi_test_sent.append(simi_sent_level1)
                    nega_test_sent.append(simi_sent_level2)
                    posi_test_doc.append(simi_doc_level1)
                    nega_test_doc.append(simi_doc_level2)
                    posi_test_overall.append(simi_overall_level1)
                    nega_test_overall.append(simi_overall_level2)
                corr_test_sent=compute_corr(posi_test_sent, nega_test_sent)
                corr_test_doc=compute_corr(posi_test_doc, nega_test_doc)
                corr_test_overall=compute_corr(posi_test_overall, nega_test_overall)

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc_sent=corr_test_sent*1.0/(test_size/3.0)
                test_acc_doc=corr_test_doc*1.0/(test_size/3.0)
                test_acc_overall=corr_test_overall*1.0/(test_size/3.0)
                #test_acc=1-test_score
#                 print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
#                            'model %f %%') %
#                           (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                print '\t\t\tepoch', epoch, ', minibatch', minibatch_index, '/', n_train_batches, 'test acc of best model', test_acc_sent*100,test_acc_doc*100,test_acc_overall*100 
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 

  
                find_better=False
                if test_acc_sent > max_acc:
                    max_acc=test_acc_sent
                    best_epoch=epoch    
                    find_better=True     
                if test_acc_doc > max_acc:
                    max_acc=test_acc_doc
                    best_epoch=epoch    
                    find_better=True 
                if test_acc_overall > max_acc:
                    max_acc=test_acc_overall
                    best_epoch=epoch    
                    find_better=True         
                print '\t\t\tmax:',    max_acc,'(at',best_epoch,')'
                if find_better==True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'  

            if patience <= iter:
                done_looping = True
                break
        
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
   
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.085,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=7,
                    maxSentLength=60,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.00005,
                    update_freq=10,
                    norm_threshold=5.0):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_msr_corpus(rootPath + 'vocab.txt',
                                           rootPath + 'tokenized_train.txt',
                                           rootPath + 'tokenized_test.txt',
                                           maxSentLength)
    mtPath = '/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test = load_mts(mtPath + 'concate_15mt_train.txt',
                                 mtPath + 'concate_15mt_test.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int32')
    indices_train_r = T.cast(indices_train_r, 'int32')
    indices_test_l = T.cast(indices_test_l, 'int32')
    indices_test_r = T.cast(indices_test_r, 'int32')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    cost_tmp = 0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.imatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.imatrix('x_index_r')
    y = T.ivector('y')
    left_l = T.iscalar()
    right_l = T.iscalar()
    left_r = T.iscalar()
    right_r = T.iscalar()
    length_l = T.iscalar()
    length_r = T.iscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size = (emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_size[0],
                                                    filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng,
                                    input=layer0_l_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_r = Conv_with_input_para(rng,
                                    input=layer0_r_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output')
    layer0_para = [conv_W, conv_b]

    layer1 = Average_Pooling(rng,
                             input_l=layer0_l_output,
                             input_r=layer0_r_output,
                             kern=nkerns[0],
                             left_l=left_l,
                             right_l=right_l,
                             left_r=left_r,
                             right_r=right_r,
                             length_l=length_l + filter_size[1] - 1,
                             length_r=length_r + filter_size[1] - 1,
                             dim=maxSentLength + filter_size[1] - 1,
                             window_size=window_width,
                             maxSentLength=maxSentLength)

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_size[1]))
    layer2_l = Conv_with_input_para(rng,
                                    input=layer1.output_tensor_l,
                                    image_shape=(batch_size, 1, nkerns[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[1], 1, nkerns[0],
                                                  filter_size[1]),
                                    W=conv2_W,
                                    b=conv2_b)
    layer2_r = Conv_with_input_para(rng,
                                    input=layer1.output_tensor_r,
                                    image_shape=(batch_size, 1, nkerns[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[1], 1, nkerns[0],
                                                  filter_size[1]),
                                    W=conv2_W,
                                    b=conv2_b)
    layer2_para = [conv2_W, conv2_b]

    layer3 = Average_Pooling_for_batch1(rng,
                                        input_l=layer2_l.output,
                                        input_r=layer2_r.output,
                                        kern=nkerns[1],
                                        left_l=left_l,
                                        right_l=right_l,
                                        left_r=left_r,
                                        right_r=right_r,
                                        length_l=length_l + filter_size[1] - 1,
                                        length_r=length_r + filter_size[1] - 1,
                                        dim=maxSentLength + filter_size[1] - 1)

    layer3_out = debug_print(layer3.output_simi, 'layer1_out')

    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)

    sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    '''
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r))  #25.2%

    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer4_input = T.concatenate(
        [mts, eucli_1, layer1.output_eucli, layer3_out, len_l, len_r],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4 = LogisticRegression(rng,
                                input=layer4_input,
                                n_in=15 + 3 + 2,
                                n_out=2)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer4.W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()
    cost_this = debug_print(layer4.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')

    test_model = theano.function(
        [index], [layer4.errors(y), layer4.y_pred],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params + layer2_para + layer0_para  # + layer1.params

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        #norm=T.sqrt((grad_i**2).sum())
        #if T.lt(norm_threshold, norm):
        #    print 'big norm'
        #    grad_i=grad_i*(norm_threshold/norm)
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, layer4.errors(y), layer4_input],
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer4.errors(y)],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            if iter % update_freq != 0:
                cost_ij, error_ij = train_model_predict(batch_start)
                #print 'cost_ij: ', cost_ij
                cost_tmp += cost_ij
                error_sum += error_ij
            else:
                cost_average, error_ij, layer3_input = train_model(batch_start)
                #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq)
                error_sum = 0
                cost_tmp = 0  #reset for the next batch
                #print layer3_input
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                for i in test_batch_start:
                    test_loss, pred_y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                print((
                    '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                    'model %f %%') % (epoch, minibatch_index, n_train_batches,
                                      test_score * 100.))
                '''
                #print 'validating & testing...'
                # compute zero-one loss on validation set
                validation_losses = []
                for i in dev_batch_start:
                    time.sleep(0.5)
                    validation_losses.append(validate_model(i))
                #validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))
            '''

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.09,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.00065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus(
        rootPath + 'vocab.txt', rootPath + 'mc500.train.tsv_standardlized.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt', max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    [
        train_data_D, train_data_Q, train_data_A, train_Y, train_Label,
        train_Length_D, train_Length_D_s, train_Length_Q, train_Length_A,
        train_leftPad_D, train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
        train_rightPad_D, train_rightPad_D_s, train_rightPad_Q,
        train_rightPad_A
    ] = train_data
    [
        test_data_D, test_data_Q, test_data_A, test_Y, test_Label,
        test_Length_D, test_Length_D_s, test_Length_Q, test_Length_A,
        test_leftPad_D, test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A = T.lvector()
    y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A = T.lscalar()

    #wmf=T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = embeddings[index_Q.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A_input = embeddings[index_A.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    #     load_model_for_conv1([conv_W, conv_b])

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_Q = Conv_with_input_para(
        rng,
        input=layer0_Q_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A = Conv_with_input_para(
        rng,
        input=layer0_A_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output = debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A_output = debug_print(layer0_A.output, 'layer0_A.output')
    layer0_para = [conv_W, conv_b]

    layer1_DQ = Average_Pooling_Scan(rng,
                                     input_D=layer0_D_output,
                                     input_r=layer0_Q_output,
                                     kern=nkerns[0],
                                     left_D=left_D,
                                     right_D=right_D,
                                     left_D_s=left_D_s,
                                     right_D_s=right_D_s,
                                     left_r=left_Q,
                                     right_r=right_Q,
                                     length_D_s=len_D_s + filter_words[1] - 1,
                                     length_r=len_Q + filter_words[1] - 1,
                                     dim=maxSentLength + filter_words[1] - 1,
                                     doc_len=maxDocLength,
                                     topk=3)
    layer1_DA = Average_Pooling_Scan(rng,
                                     input_D=layer0_D_output,
                                     input_r=layer0_A_output,
                                     kern=nkerns[0],
                                     left_D=left_D,
                                     right_D=right_D,
                                     left_D_s=left_D_s,
                                     right_D_s=right_D_s,
                                     left_r=left_A,
                                     right_r=right_A,
                                     length_D_s=len_D_s + filter_words[1] - 1,
                                     length_r=len_A + filter_words[1] - 1,
                                     dim=maxSentLength + filter_words[1] - 1,
                                     doc_len=maxDocLength,
                                     topk=3)

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    layer2_DQ = Conv_with_input_para(
        rng,
        input=layer1_DQ.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA = Conv_with_input_para(
        rng,
        input=layer1_DA.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    layer2_Q = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DQ.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_Q_output_sent_rep_Dlevel = debug_print(
        layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A_output_sent_rep_Dlevel = debug_print(
        layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel')
    layer2_para = [conv2_W, conv2_b]

    layer3_DQ = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DQ.output,
        input_r=layer2_Q_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA.output,
        input_r=layer2_A_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way
    high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    transform_gate_DQ = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b),
        'transform_gate_DQ')
    transform_gate_DA = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b),
        'transform_gate_DA')
    transform_gate_Q = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b),
        'transform_gate_Q')
    transform_gate_A = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b),
        'transform_gate_A')
    highW_para = [high_W, high_b]

    overall_D_Q = debug_print(
        (1.0 - transform_gate_DQ) * layer1_DQ.output_D_sent_level_rep +
        transform_gate_DQ * layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A = (
        1.0 - transform_gate_DA
    ) * layer1_DA.output_D_sent_level_rep + transform_gate_DA * layer3_DA.output_D_doc_level_rep
    overall_Q = (
        1.0 - transform_gate_Q
    ) * layer1_DQ.output_QA_sent_level_rep + transform_gate_Q * layer2_Q.output_sent_rep_Dlevel
    overall_A = (
        1.0 - transform_gate_A
    ) * layer1_DA.output_QA_sent_level_rep + transform_gate_A * layer2_A.output_sent_rep_Dlevel

    simi_sent_level = debug_print(
        cosine(
            layer1_DQ.output_D_sent_level_rep +
            layer1_DA.output_D_sent_level_rep,
            layer1_DQ.output_QA_sent_level_rep +
            layer1_DA.output_QA_sent_level_rep), 'simi_sent_level')
    simi_doc_level = debug_print(
        cosine(
            layer3_DQ.output_D_doc_level_rep +
            layer3_DA.output_D_doc_level_rep,
            layer2_Q.output_sent_rep_Dlevel + layer2_A.output_sent_rep_Dlevel),
        'simi_doc_level')
    simi_overall_level = debug_print(
        cosine(overall_D_Q + overall_D_A, overall_Q + overall_A),
        'simi_overall_level')

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    layer4_input = debug_print(
        T.concatenate([simi_sent_level, simi_doc_level, simi_overall_level],
                      axis=1),
        'layer4_input')  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4 = LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer4.W**2).sum() + (high_W**2).sum() + (conv2_W**2).sum() +
        (conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer4.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    #
    #     [train_data_D, train_data_Q, train_data_A, train_Y, train_Label,
    #                  train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A,
    #                 train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
    #                 train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data
    #     [test_data_D, test_data_Q, test_data_A, test_Y, test_Label,
    #                  test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A,
    #                 test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
    #                 test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data
    #     index = T.lscalar()
    #     index_D = T.lmatrix()   # now, x is the index matrix, must be integer
    #     index_Q = T.lvector()
    #     index_A= T.lvector()
    #
    #     y = T.lvector()
    #     len_D=T.lscalar()
    #     len_D_s=T.lvector()
    #     len_Q=T.lscalar()
    #     len_A=T.lscalar()
    #
    #     left_D=T.lscalar()
    #     left_D_s=T.lvector()
    #     left_Q=T.lscalar()
    #     left_A=T.lscalar()
    #
    #     right_D=T.lscalar()
    #     right_D_s=T.lvector()
    #     right_Q=T.lscalar()
    #     right_A=T.lscalar()
    #
    #
    #     #wmf=T.dmatrix()
    #     cost_tmp=T.dscalar()

    test_model = theano.function(
        [index],
        [layer4.errors(y), layer4_input, y, layer4.prop_for_posi],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_Q: test_data_Q[index],
            index_A: test_data_A[index],
            y: test_Y[index:index + batch_size],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A: test_Length_A[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A: test_leftPad_A[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A: test_rightPad_A[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params + layer2_para + layer0_para + highW_para

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index + batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer4.errors(y), layer4_input, y],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index + batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        #         readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r')
        #         train_pairs=[]
        #         train_y=[]
        #         for line in readfile:
        #             tokens=line.strip().split('\t')
        #             listt=tokens[0]+'\t'+tokens[1]
        #             train_pairs.append(listt)
        #             train_y.append(tokens[2])
        #         readfile.close()
        #         writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w')
        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             (batch_start * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij

            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(cost_average)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                test_y = []
                test_features = []
                test_prop = []
                for i in test_batch_start:
                    test_loss, layer3_input, y, posi_prop = test_model(i)
                    test_prop.append(posi_prop[0][0])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = compute_test_acc(test_y, test_prop)
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()
                clf = svm.SVC(
                    kernel='linear'
                )  #OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results = clf.decision_function(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.decision_function(test_features)

                acc_svm = compute_test_acc(test_y, results)
                acc_lr = compute_test_acc(test_y, results_lr)

                find_better = False
                if acc_svm > max_acc:
                    max_acc = acc_svm
                    best_epoch = epoch
                    find_better = True
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                if acc_lr > max_acc:
                    max_acc = acc_lr
                    best_epoch = epoch
                    find_better = True
                print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'


#                 if find_better==True:
#                     store_model_to_file(layer2_para, best_epoch)
#                     print 'Finished storing best conv params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #6
0
def evaluate_lenet5(claim, title2sentlist, title2wordlist, word2id):
    """Predict a label and pick evidence sentences for a single claim.

    Builds the two-task Theano graph (task 1: score each candidate sentence
    against the claim; task 2: 3-way claim classification using the scored
    sentences), loads parameter values from a hard-coded file path, compiles
    a test function, and runs it once with batch_size = 1.

    Args:
        claim: the claim to verify; passed unchanged to
            claim_input_2_theano_input (exact type not visible here).
        title2sentlist: passed unchanged to claim_input_2_theano_input.
        title2wordlist: passed unchanged to claim_input_2_theano_input.
        word2id: vocabulary mapping; only len(word2id) is read in this
            function (vocab_size = len(word2id) + 1), otherwise it is passed
            to claim_input_2_theano_input.

    Returns:
        A string: the predicted label name (looked up in pred_id2label),
        followed by the literal text '"<p>"', the selected sentences joined
        by the literal text '"<br />"', and the literal text '"<p/>"'.
        The double quotes are part of the emitted string.

    Notes:
        * The whole graph is rebuilt, the parameters reloaded and the
          function recompiled on every call.
        * learning_rate, n_epochs, max_pred_pick and srng are assigned but
          never read in this function.
    """
    # Hyper-parameters are fixed locals rather than arguments.
    learning_rate = 0.02
    n_epochs = 100
    emb_size = 300
    batch_size = 1  #50
    filter_size = [3]
    sent_len = 40
    claim_len = 40
    cand_size = 10
    hidden_size = [300, 300]
    max_pred_pick = 5

    # model_options = locals().copy()
    # print("model options", model_options)
    # print('title2sentlist len', len(title2sentlist))
    # print('title2wordlist len', len(title2wordlist))

    # Maps the class index produced by the task-2 classifier to a label name.
    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    # srng is not used below, but rng.randint(seed) still draws from rng and
    # therefore affects every later rng-based initialisation.
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    # Convert the raw claim and its candidate sentences into id/mask lists.
    # sent_cand_list is indexed by candidate position further below.
    claim_idlist, claim_masklist, sent_ins_ids, sent_ins_mask, sent_cand_list = claim_input_2_theano_input(
        claim, word2id, claim_len, sent_len, cand_size, title2sentlist,
        title2wordlist)

    # Wrap each input in a list to add a leading batch axis of size 1.
    test_claims = np.asarray([claim_idlist], dtype='int32')
    test_claim_mask = np.asarray([claim_masklist], dtype=theano.config.floatX)

    test_sents = np.asarray([sent_ins_ids], dtype='int32')
    test_sent_masks = np.asarray([sent_ins_mask], dtype=theano.config.floatX)

    vocab_size = len(word2id) + 1

    # Random initial embeddings; init_embeddings is part of `params`, which
    # is handed to load_model_from_file below (presumably overwriting these
    # values -- confirm against load_model_from_file).
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    # id2word = {y:x for x,y in word2id.items()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable
    '''
    the first block for evidence identification in two classes (support & reject)
    the second block for textual entailment: given evidence labels, predict the claim labels
    '''
    # Symbolic inputs of the compiled function.
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    # sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    # joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    # joint_sents_mask=T.ftensor3()
    # # joint_sents_labels=T.imatrix() #(batch, cand_size)
    # joint_claim_ids = T.imatrix() #(batch, claim_len)
    # joint_claim_mask = T.fmatrix()
    # joint_labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Embedding lookup, reshaped to (batch_size*cand_size, sent_len, emb_size)
    # and then dimshuffled to (batch_size*cand_size, emb_size, sent_len).
    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    # Same for the claim: (batch_size, emb_size, claim_len) after dimshuffle.
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    # NOTE: the order of the create_*_para calls below determines how rng is
    # consumed; do not reorder them.
    "shared parameters"
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    "tasl 1 parameters"
    task1_att_conv_W, task1_att_conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    "task 2 parameters"
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    # NOTE(review): task1_conv_b_context and conv_b_context are used by the
    # attentive conv layers below but are NOT listed here, so they are not
    # passed to load_model_from_file and keep whatever create_conv_para gave
    # them. Presumably this mirrors the parameter list used when the model
    # file was written -- confirm before changing, since the list contents
    # and order are what load_model_from_file receives.
    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    # Plain masked convolution over every candidate sentence. The mask is
    # flattened from (batch, cand_size, sent_len) to
    # (batch*cand_size, sent_len) to line up with embed_input_sents.
    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    # The claim goes through the same convolution weights (conv_W, conv_b).
    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply mask with the conv_out to set the features by UNK to zero
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    # Repeat the claim embedding cand_size times along a new axis 1 so it can
    # be paired with every candidate sentence embedding.
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv for task1
    '''
    # Left input: candidate sentences; right input: the claim (and its mask)
    # repeated cand_size times along axis 0 to pair with each candidate.
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    # Per-candidate feature: [claim emb, sentence emb, their dot product],
    # i.e. hidden_size[0]*2 + 1 values, flattened to one row per candidate.
    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))
    "to score each evidence sentence, we use the output of attentiveConv, as well as the output of standard CNN"
    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    # One sigmoid score per candidate row; reshaped right below to
    # (batch_size, cand_size).
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #presumably (batch_size*cand_size, 1) -- TODO confirm U_a shape
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # Hard 0/1 selection at threshold 0.5; sents_labels keeps the score of
    # selected candidates and zeroes the rest.
    binarize_prob = T.where(inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size)
    sents_labels = inter_matrix * binarize_prob
    '''
    training task2, predict 3 labels
    '''
    # joint_embed_input_sents=init_embeddings[joint_sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    # joint_embed_input_claim=init_embeddings[joint_claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1)
    # joint_conv_model_sents = Conv_with_Mask(rng, input_tensor3=joint_embed_input_sents,
    #          mask_matrix = joint_sents_mask.reshape((joint_sents_mask.shape[0]*joint_sents_mask.shape[1],joint_sents_mask.shape[2])),
    #          image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # joint_sent_embeddings=joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    # joint_batch_sent_emb = joint_sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))
    # "??? use joint_sents_labels means the evidence labels are not provided by task 1?"
    # joint_premise_emb = T.sum(joint_batch_sent_emb*joint_sents_labels.dimshuffle(0,1,'x'), axis=1) #(batch, hidden_size)

    # Premise = score-weighted sum of the selected sentence embeddings
    # (summed over the candidate axis).
    premise_emb = T.sum(batch_sent_emb * sents_labels.dimshuffle(0, 1, 'x'),
                        axis=1)

    # joint_conv_model_claims = Conv_with_Mask(rng, input_tensor3=joint_embed_input_claim,
    #          mask_matrix = joint_claim_mask,
    #          image_shape=(batch_size, 1, emb_size, claim_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # joint_claim_embeddings=joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    premise_hypo_emb = T.concatenate([premise_emb, claim_embeddings],
                                     axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    # Self-attention over all tokens of all candidate sentences: lay the
    # tokens out as (batch_size, cand_size*sent_len, emb_size), take all
    # pairwise dot products, softmax each row, and use the weights to build
    # a context vector per token.
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    # Flattened to a 2-D matrix before T.nnet.softmax, then reshaped back.
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    # The context is ADDED to the token embeddings (the concatenation
    # variant is the commented-out alternative in the trailing comment).
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, emb_size, sent_len (element-wise sum, matches image_shape below)
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size, cand_size, hidden_size) after the reshape
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    # Weight each candidate's attentive features by its task-1 score (zero
    # for unselected candidates), then max-pool over the candidate axis.
    masked_sents_attconv = attentive_sent_embeddings_l * sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    # Task-2 classifier input: [premise, claim, max-pooled attentive feats].
    joint_LR_input = T.concatenate([premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    # joint_loss=joint_layer_LR.negative_log_likelihood(joint_labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    # The content and order of this list are what load_model_from_file
    # receives; presumably they must match the stored file -- do not reorder.
    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    print('initialze model parameters...')
    # NOTE(review): hard-coded absolute path to the model file.
    load_model_from_file(
        '/home1/w/wenpeng/dataset/FEVER/model_para_0.9936287838053803', params)

    # train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask,joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    # Outputs: per-candidate scores, their 0/1 binarisation, and the
    # predicted claim class.
    test_model = theano.function(
        [sents_ids, sents_mask, claim_ids, claim_mask],
        [inter_matrix, binarize_prob, joint_layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_ids,sents_mask, claim_ids,claim_mask], [binarize_prob,joint_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')

    ##############
    # TEST MODEL #
    ##############
    print('... testing')

    batch_score_vec, batch_binary_vec, pred_i = test_model(
        test_sents, test_sent_masks, test_claims, test_claim_mask)
    # Index [0] picks the single example in the batch.
    sorted_indices = np.argsort(batch_score_vec[0])[::-1]  #descending order
    selected_sents = []
    # Keep the highest-scoring candidates whose binarised score is 1.
    # NOTE(review): the cap is a literal 5 rather than max_pred_pick, and
    # sent_cand_list[index] assumes sent_cand_list has an entry for every
    # candidate position -- confirm against claim_input_2_theano_input.
    for index in sorted_indices:
        if batch_binary_vec[0][index] == 1:
            selected_sents.append(sent_cand_list[index])
            if len(selected_sents) == 5:
                break

    # for i, indicator in enumerate(list(batch_binary_vec[0])):
    #     if indicator == 1:
    #         selected_sents.append(sent_cand_list[i])
    # The double quotes inside the separators are emitted literally.
    return pred_id2label.get(
        pred_i[0]) + '"<p>"' + '"<br />"'.join(selected_sents) + '"<p/>"'
Example #7
0
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=2000,
                    word_nkerns=300,
                    batch_size=1,
                    window_width=[3, 3],
                    emb_size=300,
                    margin=0.5,
                    L2_weight=0.0003,
                    Div_reg=0.03,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40,
                    max_relation_len=6,
                    max_Q_len=30,
                    neg_all=100,
                    train_size=69967,
                    test_size=19953,
                    mark='_RC_newdata'):  #train_size=75909, test_size=17386
    #     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/home/wyin/Datasets/SimpleQuestions_v2/relation_classification/'
    triple_files = [
        'train.replace_ne.withpoolwenpengFormat.txt',
        'test.replace_ne.withpoolwenpengFormat.txt'
    ]

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_train, length_per_example_test, vocab_size = load_train(
        triple_files[0], triple_files[1], max_relation_len, max_Q_len,
        train_size, test_size,
        mark)  #max_char_len, max_des_len, max_relation_len, max_Q_len

    print 'vocab_size:', vocab_size

    train_data = datasets
    #     valid_data=datasets[1]
    test_data = datasets_test
    #     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
    #

    train_relations = train_data[0]
    train_relation_lengths = train_data[1]
    train_remainQ_word_ids = train_data[2]
    train_remainQ_word_len = train_data[3]

    test_relations = test_data[0]
    test_relation_lengths = test_data[1]
    test_remainQ_word_ids = test_data[2]
    test_remainQ_word_len = test_data[3]

    train_sizes = [
        len(train_relations),
        len(train_relation_lengths),
        len(train_remainQ_word_ids),
        len(train_remainQ_word_len)
    ]
    if sum(train_sizes) / len(train_sizes) != train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes = [
        len(test_relations),
        len(test_relation_lengths),
        len(test_remainQ_word_ids),
        len(test_remainQ_word_len)
    ]
    if sum(test_sizes) / len(test_sizes) != test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    #     indices_train_pos_entity_char=theano.shared(numpy.asarray(train_pos_entity_char, dtype='int32'), borrow=True)
    #     indices_train_pos_entity_des=theano.shared(numpy.asarray(train_pos_entity_des, dtype='int32'), borrow=True)
    #     indices_train_relations=theano.shared(numpy.asarray(train_relations, dtype='int32'), borrow=True)
    #     indices_train_entity_char_lengths=theano.shared(numpy.asarray(train_entity_char_lengths, dtype='int32'), borrow=True)
    #     indices_train_entity_des_lengths=theano.shared(numpy.asarray(train_entity_des_lengths, dtype='int32'), borrow=True)
    #     indices_train_relation_lengths=theano.shared(numpy.asarray(train_relation_lengths, dtype='int32'), borrow=True)
    #     indices_train_mention_char_ids=theano.shared(numpy.asarray(train_mention_char_ids, dtype='int32'), borrow=True)
    #     indices_train_remainQ_word_ids=theano.shared(numpy.asarray(train_remainQ_word_ids, dtype='int32'), borrow=True)
    #     indices_train_mention_char_lens=theano.shared(numpy.asarray(train_mention_char_lens, dtype='int32'), borrow=True)
    #     indices_train_remainQ_word_len=theano.shared(numpy.asarray(train_remainQ_word_len, dtype='int32'), borrow=True)
    #     indices_train_entity_scores=theano.shared(numpy.asarray(train_entity_scores, dtype=theano.config.floatX), borrow=True)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #     rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    index = T.iscalar()
    rel_word_ids_M = T.imatrix()
    rel_word_lens_M = T.imatrix()
    q_word_ids_f = T.ivector()
    q_word_lens_f = T.ivector()

    filter_size = (emb_size, window_width[0])

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    q_rel_conv_W, q_rel_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    params = [embeddings, q_rel_conv_W, q_rel_conv_b]
    q_rel_conv_W_into_matrix = q_rel_conv_W.reshape(
        (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3]))

    #     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(rel_word_ids_f, rel_word_lens_f):
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
            (batch_size, max_Q_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #q-rel
        q_rel_conv = Conv_with_input_para(rng,
                                          input=q_word_input,
                                          image_shape=(batch_size, 1, emb_size,
                                                       max_Q_len),
                                          filter_shape=word_filter_shape,
                                          W=q_rel_conv_W,
                                          b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng,
                                        input=rel_word_input,
                                        image_shape=(batch_size, 1, emb_size,
                                                     max_relation_len),
                                        filter_shape=word_filter_shape,
                                        W=q_rel_conv_W,
                                        b=q_rel_conv_b)

        #         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool = Max_Pooling(rng,
                                    input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0],
                                    right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(
            rng,
            input_l=q_rel_conv.output,
            input_r=rel_conv_pool.output_maxpooling,
            left_l=q_word_lens_f[0],
            right_l=q_word_lens_f[2],
            length_l=q_word_lens_f[1] + filter_size[1] - 1,
            dim=max_Q_len + filter_size[1] - 1,
            topk=2)

        overall_simi = cosine(q_rel_pool.output_maxpooling,
                              rel_conv_pool.output_maxpooling)
        return overall_simi

    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple, sequences=[rel_word_ids_M, rel_word_lens_M])

    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(
        0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.sum(loss_simi_list)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (embeddings**2).sum() + (q_rel_conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg = Diversify_Reg(q_rel_conv_W_into_matrix)
    cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f],
        [loss_simi, simi_list],
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        #         updates.append((acc_i, acc))
        if param_i == embeddings:
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i -
                      learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                     theano.shared(numpy.zeros(emb_size)))))  #Ada
        else:
            updates.append(
                (param_i, param_i -
                 learning_rate * grad_i / T.sqrt(acc + 1e-10)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f],
        [loss_simi, cost],
        updates=updates,
        on_unused_input='ignore')

    #     train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates,
    #           givens={
    #             rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)),
    #             rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
    #             q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)),
    #             q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3))
    #
    #             }, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    best_test_accu = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0

        for jj in range(train_size):
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #print batch_start
            train_rel_word_ids_M = numpy.asarray(
                train_relations[jj], dtype='int32').reshape(
                    (length_per_example_train[jj], max_relation_len))
            train_rel_word_lens_M = numpy.asarray(
                train_relation_lengths[jj], dtype='int32').reshape(
                    (length_per_example_train[jj], 3))
            train_q_word_ids_M = numpy.asarray(
                train_remainQ_word_ids[jj], dtype='int32'
            )  #.reshape((length_per_example_train[jj], max_Q_len))
            train_q_word_lens_M = numpy.asarray(
                train_remainQ_word_len[jj],
                dtype='int32')  #.reshape((length_per_example_train[jj], 3))
            loss_simi_i, cost_i = train_model(train_rel_word_ids_M,
                                              train_rel_word_lens_M,
                                              train_q_word_ids_M,
                                              train_q_word_lens_M)

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + '\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#
            if iter > 59999 and iter % 10000 == 0:

                test_loss = []
                succ = 0
                for i in range(test_size):
                    #                     print 'testing', i, '...'
                    #prepare data
                    test_rel_word_ids_M = numpy.asarray(
                        test_relations[i], dtype='int32').reshape(
                            (length_per_example_test[i], max_relation_len))
                    test_rel_word_lens_M = numpy.asarray(
                        test_relation_lengths[i], dtype='int32').reshape(
                            (length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(
                        test_remainQ_word_ids[i], dtype='int32'
                    )  #.reshape((length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(
                        test_remainQ_word_len[i], dtype='int32'
                    )  #.reshape((length_per_example_test[i], 3))
                    loss_simi_i, simi_list_i = test_model(
                        test_rel_word_ids_M, test_rel_word_lens_M,
                        test_q_word_ids_M, test_q_word_lens_M)
                    #                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0] >= max(simi_list_i[1:]):
                        succ += 1


#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ = (succ + 20610 - test_size) * 1.0 / 20610
                #now, check MAP and MRR
                print((
                    '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                    'model %f') %
                      (epoch, minibatch_index, n_train_batches, succ))

                if best_test_accu < succ:
                    best_test_accu = succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=500, char_nkerns=100, batch_size=1, window_width=3,
                    emb_size=500, char_emb_size=100, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=6, 
                    neg_all=100, train_size=75893, test_size=19168, mark='_BiasedMaxPool_lr0.1_word500_char100_noDes_ent2.0'):  #train_size=75909, test_size=17386
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.fromMo_FB5M.txt']

    rng = numpy.random.RandomState(23455)
    word2id, char2id=load_word2id_char2id(mark)
#     datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_test_or_valid(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size)#max_char_len, max_des_len, max_relation_len, max_Q_len

    datasets_test, length_per_example_test, word2id, char2id  = load_test_or_valid(triple_files[1], char2id, word2id, max_char_len, max_des_len, max_relation_len, max_Q_len, test_size)
    vocab_size=len(word2id)
    char_size=len(char2id)
    print 'vocab_size:', vocab_size, 'char_size:', char_size

#     train_data=datasets
#     valid_data=datasets[1]
    test_data=datasets_test
#     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
#     
#     train_pos_entity_char=train_data[0]
#     train_pos_entity_des=train_data[1]
#     train_relations=train_data[2]
#     train_entity_char_lengths=train_data[3]
#     train_entity_des_lengths=train_data[4]
#     train_relation_lengths=train_data[5]
#     train_mention_char_ids=train_data[6]
#     train_remainQ_word_ids=train_data[7]
#     train_mention_char_lens=train_data[8]
#     train_remainQ_word_len=train_data[9]
#     train_entity_scores=train_data[10]

    test_pos_entity_char=test_data[0]
#    test_pos_entity_des=test_data[1]
    test_relations=test_data[2]
    test_entity_char_lengths=test_data[3]
#    test_entity_des_lengths=test_data[4]
    test_relation_lengths=test_data[5]
    test_mention_char_ids=test_data[6]
    test_remainQ_word_ids=test_data[7]
    test_mention_char_lens=test_data[8]
    test_remainQ_word_len=test_data[9]
    test_entity_scores=test_data[10]
# 
#     test_pos_entity_char=test_data[0]       #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51
#     test_pos_entity_des=test_data[1]        #matrix, each row for a examle: 20*2*51
#     test_relations=test_data[2]             #matrix, each row for a example: 5*51
#     test_entity_char_lengths=test_data[3]   #matrix, each row for a example: 3*2*51  (three valies for one entity)
#     test_entity_des_lengths=test_data[4]    #matrix, each row for a example: 3*2*51  (three values for one entity)
#     test_relation_lengths=test_data[5]      #matrix, each row for a example: 3*51
#     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
#     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
#     test_mention_char_lens=test_data[8]     #matrix, each three values for a mention: 3
#     test_remainQ_word_len=test_data[9]      #matrix, each three values for a remain question: 3
    

#     train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
#            len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
#     if sum(train_sizes)/len(train_sizes)!=train_size:
#         print 'weird size:', train_sizes
#         exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_relations), len(test_entity_char_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes)/len(test_sizes)!=test_size:
        print 'weird size:', test_sizes
        exit(0)

#     n_train_batches=train_size/batch_size
#     n_test_batches=test_size/batch_size
    
#     train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
#     test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    
#     indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
#     indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
#     indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
#     indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
#     indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
#     indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
#     indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
#     indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
#     indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
#     indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)   
#     indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) 
    
#     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
#     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
#     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
#     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
#     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
#     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
#     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
#     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
#     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
#     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)   
#     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
#     rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
    index = T.iscalar()
    chosed_indices=T.ivector()
    
    ent_char_ids_M = T.imatrix()   
    ent_lens_M = T.imatrix()
    men_char_ids_M = T.imatrix()  
    men_lens_M=T.imatrix()
    rel_word_ids_M=T.imatrix()
    rel_word_lens_M=T.imatrix()
    #desH_word_ids_M=T.imatrix()
    #desH_word_lens_M=T.imatrix()
    q_word_ids_M=T.imatrix()
    q_word_lens_M=T.imatrix()
    ent_scores=T.fvector()

    
    filter_size=(emb_size,window_width)
    char_filter_size=(char_emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    #q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b]#, q_desH_conv_W, q_desH_conv_b]
    load_model_from_file(rootPath, params, mark)

    def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,
                       men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f):
        

#         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        #desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
#         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
        #ent_mention
        ent_char_conv = Conv_with_input_para(rng, input=ent_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng, input=men_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        #q_desH
        #q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
        #        image_shape=(batch_size, 1, emb_size, max_Q_len),
        #        filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        #desH_conv = Conv_with_input_para(rng, input=desH_word_input,
        #        image_shape=(batch_size, 1, emb_size, max_des_len),
        #        filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        
        ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2])
        
        #q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling,
                                                left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1,
                                                dim=max_Q_len+filter_size[1]-1, topk=2)
        

        #q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        #desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        

        overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)*0.33333+\
                    cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)*0.55
         #           0.0*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling)
#                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
                sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,
                   men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M])
    
    simi_list+=0.2*ent_scores
    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.sum(loss_simi_list)

    




    test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M,
                                  q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... testing'

    start_time = time.clock()
    mid_time = start_time

    epoch = 0



                 
    test_loss=[]
    succ=0
    for i in range(test_size):
        
        #prepare data
        test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int32').reshape((length_per_example_test[i], max_char_len))  
        test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3))
        test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int32').reshape((length_per_example_test[i], max_char_len))
        test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int32').reshape((length_per_example_test[i], 3))
        test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape((length_per_example_test[i], max_relation_len))  
        test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3))
        #test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int32').reshape((length_per_example_test[i], max_des_len))
        #test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3))
        test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32').reshape((length_per_example_test[i], max_Q_len))
        test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32').reshape((length_per_example_test[i], 3))
        test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX)
    
    
    
    
                    
        loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M,
                                           test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
    #                     print 'simi_list_i:', simi_list_i[:10]
        test_loss.append(loss_simi_i)
        if len(simi_list_i)==1 or simi_list_i[0]>=max(simi_list_i[1:]):
            succ+=1
        if i%1000==0:
            print 'testing', i, '...acc:', (succ*1.0/(i+1))*(19168*1.0/21687)
    succ=succ*100.0/21687
    #now, check MAP and MRR
    print 'accu:', succ
    

#     store_model_to_file(rootPath, params, succ, mark)

    print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[50, 50], batch_size=1, window_width=7,
                    maxSentLength=60, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.00005, update_freq=10, norm_threshold=5.0):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix('x_index_r')
    y = T.ivector('y')  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_para=[conv_W, conv_b]    
    
    layer1=Average_Pooling(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                     left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                      length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                       dim=maxSentLength+filter_size[1]-1, window_size=window_width, maxSentLength=maxSentLength)
    
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]))
    layer2_l = Conv_with_input_para(rng, input=layer1.output_tensor_l,
            image_shape=(batch_size, 1, nkerns[0], ishape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b)
    layer2_r = Conv_with_input_para(rng, input=layer1.output_tensor_r,
            image_shape=(batch_size, 1, nkerns[0], ishape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_size[1]), W=conv2_W, b=conv2_b)
    layer2_para=[conv2_W, conv2_b]
        
    layer3=Average_Pooling_for_batch1(rng, input_l=layer2_l.output, input_r=layer2_r.output, kern=nkerns[1],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    
    layer3_out=debug_print(layer3.output_simi, 'layer1_out')
    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    '''
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))    
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer4_input=T.concatenate([mts, eucli_1,layer1.output_eucli, layer3_out,len_l, len_r], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4=LogisticRegression(rng, input=layer4_input, n_in=15+3+2, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer4.W** 2).sum()+(conv2_W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
    cost_this =debug_print(layer4.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    

    
    test_model = theano.function([index], [layer4.errors(y), layer4.y_pred],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params+ layer2_para+ layer0_para# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        #norm=T.sqrt((grad_i**2).sum())
        #if T.lt(norm_threshold, norm):
        #    print 'big norm'
        #    grad_i=grad_i*(norm_threshold/norm)
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index], [cost,layer4.errors(y), layer4_input], updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size]}, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer4.errors(y)],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            if iter%update_freq != 0:
                cost_ij, error_ij=train_model_predict(batch_start)
                #print 'cost_ij: ', cost_ij
                cost_tmp+=cost_ij
                error_sum+=error_ij
            else:
                cost_average, error_ij, layer3_input= train_model(batch_start)
                #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq)
                error_sum=0
                cost_tmp=0#reset for the next batch
                #print layer3_input
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                for i in test_batch_start:
                    test_loss, pred_y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))
                '''
                #print 'validating & testing...'
                # compute zero-one loss on validation set
                validation_losses = []
                for i in dev_batch_start:
                    time.sleep(0.5)
                    validation_losses.append(validate_model(i))
                #validation_losses = [validate_model(i) for i in dev_batch_start]
                this_validation_loss = numpy.mean(validation_losses)
                print('\t\tepoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index , n_train_batches, \
                       this_validation_loss * 100.))
                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it on the test set
                    test_losses = [test_model(i) for i in test_batch_start]
                    test_score = numpy.mean(test_losses)
                    print(('\t\t\t\tepoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           test_score * 100.))
            '''

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #10
0
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
        
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
    layer0_para=[conv_W, conv_b] 
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    layer2_para=[conv2_W, conv2_b]
    high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # this part decides nkern[0] and nkern[1] must be in the same dimension
    highW_para=[high_W, high_b]
    params = layer2_para+layer0_para+highW_para#+[embeddings]

    layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
            image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
Example #11
0
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=20, cand_size=10,hidden_size=[300,300], max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, _, word2id  = load_fever_train(sent_len, claim_len, cand_size)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names,test_ground_names,_, word2id = load_fever_dev(sent_len, claim_len, cand_size, word2id)

    train_sents=np.asarray(train_sents, dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents=np.asarray(test_sents, dtype='int32')

    train_sent_masks=np.asarray(train_sent_masks, dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_sent_masks=np.asarray(test_sent_masks, dtype=theano.config.floatX)

    train_sent_labels=np.asarray(train_sent_labels, dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sent_labels=np.asarray(test_sent_labels, dtype='int32')

    train_claims=np.asarray(train_claims, dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_claims=np.asarray(test_claims, dtype='int32')

    train_claim_mask=np.asarray(train_claim_mask, dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_claim_mask=np.asarray(test_claim_mask, dtype=theano.config.floatX)


    # train_labels_store=np.asarray(all_labels[0], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[2], dtype='int32')

    train_size=len(train_claims)
    # dev_size=len(dev_labels_store)
    test_size=len(test_claims)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size=len(word2id)+1

    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    "now, start to build the input form of the model"
    sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    sents_mask=T.ftensor3()
    sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix() #(batch, claim_len)
    claim_mask = T.imatrix()
    # labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'


    embed_input_sents=init_embeddings[sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim=init_embeddings[claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1)



    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    # conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    NN_para = [conv_W, conv_b]
    conv_model_sents = Conv_with_Mask(rng, input_tensor3=embed_input_sents,
             mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])),
             image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings=conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(rng, input_tensor3=embed_input_claim,
             mask_matrix = claim_mask,
             image_shape=(batch_size, 1, emb_size, claim_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    claim_embeddings=conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0,'x', 1), cand_size, axis=1)

    concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2)
    concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2))

    LR_input = concate_2_matrix#T.concatenate([sent_embeddings,sent_embeddings2], axis=1)
    LR_input_size = hidden_size[0]*2
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(rng, 1, LR_input_size) # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(concate_2_matrix.dot(U_a))  #batch * 12
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    loss = -T.mean(T.log(prob_pos))

    #
    # "Logistic Regression layer"
    # LR_input = T.concatenate([attentive_sent_embeddings_l,attentive_sent_embeddings_r,attentive_sent_embeddings_l+attentive_sent_embeddings_r,attentive_sent_embeddings_l*attentive_sent_embeddings_r],axis=1)
    # LR_input_size=4*hidden_size[0]
    #
    # U_a = create_ensemble_para(rng, 3, LR_input_size) # (input_size, 3)
    # LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    # LR_para=[U_a, LR_b]
    #
    # layer_LR=LogisticRegression(rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    '''
    testing
    '''
    binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size



    params = [init_embeddings]+NN_para+LR_para
    cost=loss
    "Use AdaGrad to update parameters"
    updates =   Gradient_Cost_Para(cost,params, learning_rate)


    train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids,sents_mask,claim_ids,claim_mask], inter_matrix, allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]


    max_acc_dev=0.0
    max_test_f1=0.0

    cost_i=0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(train_indices) #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]
            '''
            train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask
            sents_ids,sents_mask,sents_labels,claim_ids,claim_mask
            '''
            cost_i+= train_model(
                                train_sents[train_id_batch],
                                train_sent_masks[train_id_batch],
                                train_sent_labels[train_id_batch],
                                train_claims[train_id_batch],
                                train_claim_mask[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()



                '''
                test
                test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask,
                sents_ids,sents_mask,claim_ids,claim_mask
                '''
                f1_sum=0.0
                for test_batch_id in test_batch_start: # for each test batch
                    batch_prob=test_model(
                            test_sents[test_batch_id:test_batch_id+batch_size],
                            test_sent_masks[test_batch_id:test_batch_id+batch_size],
                            test_claims[test_batch_id:test_batch_id+batch_size],
                            test_claim_mask[test_batch_id:test_batch_id+batch_size])

                    batch_sent_labels = test_sent_labels[test_batch_id:test_batch_id+batch_size]
                    batch_sent_names = test_sent_names[test_batch_id:test_batch_id+batch_size]
                    batch_ground_names = test_ground_names[test_batch_id:test_batch_id+batch_size]


                    for i in range(batch_size):
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]

                        zipped=[(batch_prob[i,k],batch_sent_labels[i][k],batch_sent_names[i][k]) for k in range(cand_size)]
                        sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True)
                        # print 'sorted_zip:', sorted_zip
                        # exit(0)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                we should consider a rank, instead of binary
                                '''
                                if triple[0] >0.5:
                                    pred_sent_names.append(batch_sent_names[i][j])
                                    if len(pred_sent_names) == max_pred_pick:
                                        break
                        f1_i = compute_f1_two_list_names(pred_sent_names, gold_sent_names)
                        f1_sum+=f1_i


                test_f1=f1_sum/(len(test_batch_start)*batch_size)

                if test_f1 > max_test_f1:
                    max_test_f1=test_f1
                print '\t\tcurrent test_f1:', test_f1,' ; ','\t\t\t\t\tmax_test_f1:', max_test_f1



        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
Example #12
0
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    word_nkerns=50,
                    char_nkerns=4,
                    batch_size=1,
                    window_width=[2, 5],
                    emb_size=50,
                    char_emb_size=4,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0003,
                    Div_reg=0.03,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40,
                    max_char_len=40,
                    max_des_len=20,
                    max_relation_len=5,
                    max_Q_len=30,
                    train_neg_size=21,
                    neg_all=100,
                    train_size=200,
                    test_size=200,
                    mark='_forfun'):  #train_size=75909, test_size=17386
    #     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = [
        'annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt',
        'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt'
    ]

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size = load_train(
        triple_files[0], triple_files[1], max_char_len, max_des_len,
        max_relation_len, max_Q_len, train_size, test_size,
        mark)  #max_char_len, max_des_len, max_relation_len, max_Q_len

    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data = datasets
    #     valid_data=datasets[1]
    test_data = datasets_test
    #     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
    #
    train_pos_entity_char = train_data[0]
    train_pos_entity_des = train_data[1]
    train_relations = train_data[2]
    train_entity_char_lengths = train_data[3]
    train_entity_des_lengths = train_data[4]
    train_relation_lengths = train_data[5]
    train_mention_char_ids = train_data[6]
    train_remainQ_word_ids = train_data[7]
    train_mention_char_lens = train_data[8]
    train_remainQ_word_len = train_data[9]
    train_entity_scores = train_data[10]

    test_pos_entity_char = test_data[0]
    test_pos_entity_des = test_data[1]
    test_relations = test_data[2]
    test_entity_char_lengths = test_data[3]
    test_entity_des_lengths = test_data[4]
    test_relation_lengths = test_data[5]
    test_mention_char_ids = test_data[6]
    test_remainQ_word_ids = test_data[7]
    test_mention_char_lens = test_data[8]
    test_remainQ_word_len = test_data[9]
    test_entity_scores = test_data[10]
    #
    #     test_pos_entity_char=test_data[0]       #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51
    #     test_pos_entity_des=test_data[1]        #matrix, each row for a examle: 20*2*51
    #     test_relations=test_data[2]             #matrix, each row for a example: 5*51
    #     test_entity_char_lengths=test_data[3]   #matrix, each row for a example: 3*2*51  (three valies for one entity)
    #     test_entity_des_lengths=test_data[4]    #matrix, each row for a example: 3*2*51  (three values for one entity)
    #     test_relation_lengths=test_data[5]      #matrix, each row for a example: 3*51
    #     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
    #     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
    #     test_mention_char_lens=test_data[8]     #matrix, each three values for a mention: 3
    #     test_remainQ_word_len=test_data[9]      #matrix, each three values for a remain question: 3


    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    if sum(train_sizes) / len(train_sizes) != train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes) / len(test_sizes) != test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_pos_entity_char = pythonList_into_theanoIntMatrix(
        train_pos_entity_char)
    indices_train_pos_entity_des = pythonList_into_theanoIntMatrix(
        train_pos_entity_des)
    indices_train_relations = pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths = pythonList_into_theanoIntMatrix(
        train_entity_char_lengths)
    indices_train_entity_des_lengths = pythonList_into_theanoIntMatrix(
        train_entity_des_lengths)
    indices_train_relation_lengths = pythonList_into_theanoIntMatrix(
        train_relation_lengths)
    indices_train_mention_char_ids = pythonList_into_theanoIntMatrix(
        train_mention_char_ids)
    indices_train_remainQ_word_ids = pythonList_into_theanoIntMatrix(
        train_remainQ_word_ids)
    indices_train_mention_char_lens = pythonList_into_theanoIntMatrix(
        train_mention_char_lens)
    indices_train_remainQ_word_len = pythonList_into_theanoIntMatrix(
        train_remainQ_word_len)
    indices_train_entity_scores = pythonList_into_theanoFloatMatrix(
        train_entity_scores)

    #     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
    #     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
    #     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
    #     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
    #     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
    #     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
    #     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
    #     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
    #     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
    #     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)
    #     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'word_emb' + mark + '.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX,
                                           numpy.random.RandomState(1234))
    char_rand_values[0] = numpy.array(numpy.zeros(char_emb_size),
                                      dtype=theano.config.floatX)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    index = T.lscalar()
    chosed_indices = T.lvector()

    ent_char_ids_M = T.lmatrix()
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()
    men_lens_M = T.lmatrix()
    rel_word_ids_M = T.lmatrix()
    rel_word_lens_M = T.lmatrix()
    desH_word_ids_M = T.lmatrix()
    desH_word_lens_M = T.lmatrix()
    #     desT_word_ids_M=T.lmatrix()
    #     desT_word_lens_M=T.lmatrix()
    q_word_ids_M = T.lmatrix()
    q_word_lens_M = T.lmatrix()
    ent_scores = T.dvector()

    #max_char_len, max_des_len, max_relation_len, max_Q_len
    #     ent_men_ishape = (char_emb_size, max_char_len)  # this is the size of MNIST images
    #     rel_ishape=(emb_size, max_relation_len)
    #     des_ishape=(emb_size, max_des_len)
    #     q_ishape=(emb_size, max_Q_len)

    filter_size = (emb_size, window_width[0])
    char_filter_size = (char_emb_size, window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    char_filter_shape = (char_nkerns, 1, char_filter_size[0],
                         char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng,
                                                filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    params = [
        char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W,
        q_rel_conv_b, q_desH_conv_W, q_desH_conv_b
    ]
    char_conv_W_into_matrix = char_conv_W.reshape(
        (char_conv_W.shape[0], char_conv_W.shape[2] * char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix = q_rel_conv_W.reshape(
        (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix = q_desH_conv_W.reshape(
        (q_desH_conv_W.shape[0],
         q_desH_conv_W.shape[2] * q_desH_conv_W.shape[3]))

    #     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f,
                               rel_word_lens_f, desH_word_ids_f,
                               desH_word_lens_f, men_char_ids_f, q_word_ids_f,
                               men_lens_f, q_word_lens_f):

        #         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape(
            (batch_size, max_des_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
            (batch_size, max_Q_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #ent_mention
        ent_char_conv = Conv_with_input_para(rng,
                                             input=ent_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng,
                                             input=men_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng,
                                          input=q_word_input,
                                          image_shape=(batch_size, 1, emb_size,
                                                       max_Q_len),
                                          filter_shape=word_filter_shape,
                                          W=q_rel_conv_W,
                                          b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng,
                                        input=rel_word_input,
                                        image_shape=(batch_size, 1, emb_size,
                                                     max_relation_len),
                                        filter_shape=word_filter_shape,
                                        W=q_rel_conv_W,
                                        b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng,
                                           input=q_word_input,
                                           image_shape=(batch_size, 1,
                                                        emb_size, max_Q_len),
                                           filter_shape=word_filter_shape,
                                           W=q_desH_conv_W,
                                           b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng,
                                         input=desH_word_input,
                                         image_shape=(batch_size, 1, emb_size,
                                                      max_des_len),
                                         filter_shape=word_filter_shape,
                                         W=q_desH_conv_W,
                                         b=q_desH_conv_b)
        #         #q_desT
        #         q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
        #                 image_shape=(batch_size, 1, emb_size, max_Q_len),
        #                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        #         desT_conv = Conv_with_input_para(rng, input=desT_word_input,
        #                 image_shape=(batch_size, 1, emb_size, max_des_len),
        #                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
        #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')

        ent_conv_pool = Max_Pooling(rng,
                                    input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0],
                                    right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng,
                                    input_l=men_char_conv.output,
                                    left_l=men_lens_f[0],
                                    right_l=men_lens_f[2])

        #         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool = Max_Pooling(rng,
                                    input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0],
                                    right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(
            rng,
            input_l=q_rel_conv.output,
            input_r=rel_conv_pool.output_maxpooling,
            left_l=q_word_lens_f[0],
            right_l=q_word_lens_f[2],
            length_l=q_word_lens_f[1] + filter_size[1] - 1,
            dim=max_Q_len + filter_size[1] - 1,
            topk=2)

        q_desH_pool = Max_Pooling(rng,
                                  input_l=q_desH_conv.output,
                                  left_l=q_word_lens_f[0],
                                  right_l=q_word_lens_f[2])
        desH_conv_pool = Max_Pooling(rng,
                                     input_l=desH_conv.output,
                                     left_l=desH_word_lens_f[0],
                                     right_l=desH_word_lens_f[2])

        #         q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
        #         desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])


        overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\
                    0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0

        #                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi

    simi_list, updates = theano.scan(SimpleQ_matches_Triple,
                                     sequences=[
                                         ent_char_ids_M, ent_lens_M,
                                         rel_word_ids_M, rel_word_lens_M,
                                         desH_word_ids_M, desH_word_lens_M,
                                         men_char_ids_M, q_word_ids_M,
                                         men_lens_M, q_word_lens_M
                                     ])

    simi_list += 0.5 * ent_scores

    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(
        0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.mean(loss_simi_list)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (char_embeddings**2).sum() + (embeddings**2).sum() +
        (char_conv_W**2).sum() + (q_rel_conv_W**2).sum() +
        (q_desH_conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg = Diversify_Reg(char_conv_W_into_matrix) + Diversify_Reg(
        q_rel_conv_W_into_matrix) + Diversify_Reg(q_desH_conv_W_into_matrix)
    cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function([
        ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M,
        rel_word_lens_M, desH_word_ids_M, desH_word_lens_M, q_word_ids_M,
        q_word_lens_M, ent_scores
    ], [loss_simi, simi_list],
                                 on_unused_input='ignore')
    #           givens={
    #             ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)),
    #             ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)),
    #             men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)),
    #             men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)),
    #             rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)),
    #             rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)),
    #             desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)),
    #             desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)),
    # #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2],
    # #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
    #             q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)),
    #             q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)),
    #             ent_scores : test_entity_scores[index]},

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    #+[embeddings]# + layer1.params
    #     params_conv = [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        #         updates.append((acc_i, acc))
        if param_i == embeddings:
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i -
                      learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                     theano.shared(numpy.zeros(emb_size)))))  #Ada
        elif param_i == char_embeddings:
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i -
                      learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                     theano.shared(numpy.zeros(char_emb_size)))))  #AdaGrad
        else:
            updates.append(
                (param_i, param_i -
                 learning_rate * grad_i / T.sqrt(acc + 1e-10)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, chosed_indices],
        [loss_simi, cost],
        updates=updates,
        givens={
            ent_char_ids_M:
            indices_train_pos_entity_char[index].reshape(
                (neg_all, max_char_len))[chosed_indices].reshape(
                    (train_neg_size, max_char_len)),
            ent_lens_M:
            indices_train_entity_char_lengths[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            men_char_ids_M:
            indices_train_mention_char_ids[index].reshape(
                (neg_all, max_char_len))[chosed_indices].reshape(
                    (train_neg_size, max_char_len)),
            men_lens_M:
            indices_train_mention_char_lens[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            rel_word_ids_M:
            indices_train_relations[index].reshape(
                (neg_all, max_relation_len))[chosed_indices].reshape(
                    (train_neg_size, max_relation_len)),
            rel_word_lens_M:
            indices_train_relation_lengths[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            desH_word_ids_M:
            indices_train_pos_entity_des[index].reshape(
                (neg_all, max_des_len))[chosed_indices].reshape(
                    (train_neg_size, max_des_len)),
            desH_word_lens_M:
            indices_train_entity_des_lengths[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2],
            #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
            q_word_ids_M:
            indices_train_remainQ_word_ids[index].reshape(
                (neg_all, max_Q_len))[chosed_indices].reshape(
                    (train_neg_size, max_Q_len)),
            q_word_lens_M:
            indices_train_remainQ_word_len[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            ent_scores:
            indices_train_entity_scores[index][chosed_indices]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    best_test_accu = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0

        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #print batch_start
            sample_indices = [0] + random.sample(range(1, neg_all),
                                                 train_neg_size - 1)
            loss_simi_i, cost_i = train_model(batch_start, sample_indices)
            #             if batch_start%1==0:
            #                 print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #                 store_model_to_file(rootPath, params)

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + '\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#
            if iter % n_train_batches == 0:

                test_loss = []
                succ = 0
                for i in range(test_size):
                    #                     print 'testing', i, '...'
                    #prepare data
                    test_ent_char_ids_M = numpy.asarray(
                        test_pos_entity_char[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_char_len))
                    test_ent_lens_M = numpy.asarray(
                        test_entity_char_lengths[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_men_char_ids_M = numpy.asarray(
                        test_mention_char_ids[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_char_len))
                    test_men_lens_M = numpy.asarray(
                        test_mention_char_lens[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_rel_word_ids_M = numpy.asarray(
                        test_relations[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_relation_len))
                    test_rel_word_lens_M = numpy.asarray(
                        test_relation_lengths[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_desH_word_ids_M = numpy.asarray(
                        test_pos_entity_des[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_des_len))
                    test_desH_word_lens_M = numpy.asarray(
                        test_entity_des_lengths[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(
                        test_remainQ_word_ids[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(
                        test_remainQ_word_len[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_ent_scores = numpy.asarray(test_entity_scores[i],
                                                    dtype=theano.config.floatX)

                    loss_simi_i, simi_list_i = test_model(
                        test_ent_char_ids_M, test_ent_lens_M,
                        test_men_char_ids_M, test_men_lens_M,
                        test_rel_word_ids_M, test_rel_word_lens_M,
                        test_desH_word_ids_M, test_desH_word_lens_M,
                        test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
                    #                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0] >= max(simi_list_i[1:]):
                        succ += 1


#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ = succ * 1.0 / test_size
                #now, check MAP and MRR
                print((
                    '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                    'model %f') %
                      (epoch, minibatch_index, n_train_batches, succ))

                if best_test_accu < succ:
                    best_test_accu = succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, word_nkerns=300, batch_size=1, window_width=[3,3],
                    emb_size=300, 
                    margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_relation_len=6, max_Q_len=30, 
                    neg_all=100, train_size=69967, test_size=19953, mark='_RC_newdata'):  #train_size=75909, test_size=17386
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/home/wyin/Datasets/SimpleQuestions_v2/relation_classification/'
    triple_files=['train.replace_ne.withpoolwenpengFormat.txt', 'test.replace_ne.withpoolwenpengFormat.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_train, length_per_example_test, vocab_size=load_train(triple_files[0], triple_files[1], max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len

    
    print 'vocab_size:', vocab_size

    train_data=datasets
#     valid_data=datasets[1]
    test_data=datasets_test
#     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
#     

    train_relations=train_data[0]
    train_relation_lengths=train_data[1]
    train_remainQ_word_ids=train_data[2]
    train_remainQ_word_len=train_data[3]

    test_relations=test_data[0]
    test_relation_lengths=test_data[1]
    test_remainQ_word_ids=test_data[2]
    test_remainQ_word_len=test_data[3]


    

    train_sizes=[len(train_relations),len(train_relation_lengths),len(train_remainQ_word_ids), len(train_remainQ_word_len)]
    if sum(train_sizes)/len(train_sizes)!=train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_relations),len(test_relation_lengths),  len(test_remainQ_word_ids),len(test_remainQ_word_len)]
    if sum(test_sizes)/len(test_sizes)!=test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    

    
    
#     indices_train_pos_entity_char=theano.shared(numpy.asarray(train_pos_entity_char, dtype='int32'), borrow=True)
#     indices_train_pos_entity_des=theano.shared(numpy.asarray(train_pos_entity_des, dtype='int32'), borrow=True)
#     indices_train_relations=theano.shared(numpy.asarray(train_relations, dtype='int32'), borrow=True)
#     indices_train_entity_char_lengths=theano.shared(numpy.asarray(train_entity_char_lengths, dtype='int32'), borrow=True)
#     indices_train_entity_des_lengths=theano.shared(numpy.asarray(train_entity_des_lengths, dtype='int32'), borrow=True)
#     indices_train_relation_lengths=theano.shared(numpy.asarray(train_relation_lengths, dtype='int32'), borrow=True)
#     indices_train_mention_char_ids=theano.shared(numpy.asarray(train_mention_char_ids, dtype='int32'), borrow=True)
#     indices_train_remainQ_word_ids=theano.shared(numpy.asarray(train_remainQ_word_ids, dtype='int32'), borrow=True)
#     indices_train_mention_char_lens=theano.shared(numpy.asarray(train_mention_char_lens, dtype='int32'), borrow=True)
#     indices_train_remainQ_word_len=theano.shared(numpy.asarray(train_remainQ_word_len, dtype='int32'), borrow=True)
#     indices_train_entity_scores=theano.shared(numpy.asarray(train_entity_scores, dtype=theano.config.floatX), borrow=True)
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
#     rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
    index = T.iscalar()
    rel_word_ids_M=T.imatrix()
    rel_word_lens_M=T.imatrix()
    q_word_ids_f=T.ivector()
    q_word_lens_f=T.ivector()

    
    filter_size=(emb_size,window_width[0])
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [embeddings,q_rel_conv_W, q_rel_conv_b]
    q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3]))
#     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(rel_word_ids_f,rel_word_lens_f):
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)

        
#         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, 
                                                left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, 
                                                dim=max_Q_len+filter_size[1]-1, topk=2)
        
   
        
        
        overall_simi=cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
                sequences=[rel_word_ids_M,rel_word_lens_M])

    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.sum(loss_simi_list)

    

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((embeddings** 2).sum()+(q_rel_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(q_rel_conv_W_into_matrix)
    cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    



    test_model = theano.function([rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f], [loss_simi, simi_list],on_unused_input='ignore')
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
#         updates.append((acc_i, acc))    
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size)))))   #Ada
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc)) 

    train_model = theano.function([rel_word_ids_M, rel_word_lens_M, q_word_ids_f, q_word_lens_f], [loss_simi, cost],updates=updates, on_unused_input='ignore')
      
#     train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates,
#           givens={
#             rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)),  
#             rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
#             q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), 
#             q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3))
#             
#             }, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    best_test_accu=0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0


        for jj in range(train_size): 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
 
            minibatch_index=minibatch_index+1
            #print batch_start
            train_rel_word_ids_M = numpy.asarray(train_relations[jj], dtype='int32').reshape((length_per_example_train[jj], max_relation_len))  
            train_rel_word_lens_M = numpy.asarray(train_relation_lengths[jj], dtype='int32').reshape((length_per_example_train[jj], 3))
            train_q_word_ids_M = numpy.asarray(train_remainQ_word_ids[jj], dtype='int32')#.reshape((length_per_example_train[jj], max_Q_len))
            train_q_word_lens_M = numpy.asarray(train_remainQ_word_len[jj], dtype='int32')#.reshape((length_per_example_train[jj], 3))
            loss_simi_i, cost_i=train_model(train_rel_word_ids_M, train_rel_word_lens_M,train_q_word_ids_M, train_q_word_lens_M)

 
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#             
            if iter > 59999 and iter % 10000 == 0:
                 
                test_loss=[]
                succ=0
                for i in range(test_size):
#                     print 'testing', i, '...'
                    #prepare data
                    test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int32').reshape((length_per_example_test[i], max_relation_len))  
                    test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int32')#.reshape((length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int32')#.reshape((length_per_example_test[i], 3))
                    loss_simi_i,simi_list_i=test_model(test_rel_word_ids_M, test_rel_word_lens_M,test_q_word_ids_M, test_q_word_lens_M)
#                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0]>=max(simi_list_i[1:]):
                        succ+=1
#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ=(succ+20610-test_size)*1.0/20610
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                           'model %f') %
                          (epoch, minibatch_index, n_train_batches,succ))

                if best_test_accu< succ:
                    best_test_accu=succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock() 

            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3,
                    maxSentLength=30, emb_size=300, hidden_size=[300,10],
                    margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    vocab_size=len(word2id)+1
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2]
    indices_train_r=indices_train[1::2]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2]
    indices_test_r=indices_test[1::2]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    train_size = len(indices_train_l)
    test_size = len(indices_test_l)
    
    train_batch_start=range(train_size)
    test_batch_start=range(test_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int32')
#     indices_train_r=T.cast(indices_train_r, 'int32')
#     indices_test_l=T.cast(indices_test_l, 'int32')
#     indices_test_r=T.cast(indices_test_r, 'int32')
    


    rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng)
#     rand_values[0]=numpy.array(numpy.zeros(emb_size))
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True)      
    

    
    # allocate symbolic variables for the data
#     index = T.iscalar()
    x_index_l = T.imatrix()   # now, x is the index matrix, must be integer
    x_index_r = T.imatrix()
    y = T.ivector()  
    left_l=T.iscalar()
    right_l=T.iscalar()
    left_r=T.iscalar()
    right_r=T.iscalar()
    length_l=T.iscalar()
    length_r=T.iscalar()
    norm_length_l=T.fscalar()
    norm_length_r=T.fscalar()
    mts=T.fmatrix()
    wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0]))
    layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0]))
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    
    
    
    sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    HL_layer_1_input=T.concatenate([
#                                 mts, 
                                eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                uni_cosine,
#                                 sum_uni_l,
#                                 sum_uni_r,
#                                 sum_uni_l+sum_uni_r,
                                1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)),
                                cosine(layer0_l_output_maxpool, layer0_r_output_maxpool),
                                layer0_l_output_maxpool,
                                layer0_r_output_maxpool,
                                T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10),
                                
                                layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                layer1.output_cosine,
                                layer1.output_vector_l,
                                layer1.output_vector_r,
                                T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10),
#                                 len_l, len_r
                                layer1.output_attentions
#                                 wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input,
                                mts, len_l, len_r
#                                 wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)

    HL_layer_1_input_size=1+1+   1+1+3* nkerns[0]   +1+1+3*nkerns[0]+10*10
    
    HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh)
    
    LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1)
    LR_layer_input_with_extra=T.concatenate([HL_layer_2.output,  HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output,
    
    LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2)
#     LR_layer_input=HL_layer_2.output
#     LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2)

#     layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
#     diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg
    

    test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True)



    params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        clipped_grad = T.clip(grad_i, -0.5, 0.5)
        acc = acc_i + T.sqr(clipped_grad)
        updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True)

    train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r,
                                  mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is


    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    nn_max_acc=0.0
    best_iter=0
    cost_tmp=0.0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data

        for index in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * train_size + minibatch_index +1

            minibatch_index=minibatch_index+1

#             if iter%update_freq != 0:
#                 cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
#                 #print 'cost_ij: ', cost_ij
#                 cost_tmp+=cost_ij
#                 error_sum+=error_ij
#             else:

            cost_i, error_i= train_model(indices_train_l[index: index + batch_size],
                                                              indices_train_r[index: index + batch_size],
                                                              trainY[index: index + batch_size],
                                                              trainLeftPad_l[index],
                                                              trainRightPad_l[index],
                                                              trainLeftPad_r[index],
                                                              trainRightPad_r[index],
                                                              trainLengths_l[index],
                                                              trainLengths_r[index],
                                                              normalized_train_length_l[index],
                                                              normalized_train_length_r[index],
                                                              mt_train[index: index + batch_size],
                                                              wm_train[index: index + batch_size])
            cost_tmp+=cost_i
            if iter < 6000 and iter %100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
            if iter >= 6000 and iter % 100 == 0:
#             if iter%100 ==0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter)
                test_losses=[]
                test_y=[]
                test_features=[]
                for index in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size],
                                                                  indices_test_r[index: index + batch_size],
                                                                  testY[index: index + batch_size],
                                                                  testLeftPad_l[index],
                                                                  testRightPad_l[index],
                                                                  testLeftPad_r[index],
                                                                  testRightPad_r[index],
                                                                  testLengths_l[index],
                                                                  testLengths_r[index],
                                                                  normalized_test_length_l[index],
                                                                  normalized_test_length_r[index],
                                                                  mt_test[index: index + batch_size],
                                                                  wm_test[index: index + batch_size])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc = (1-test_score) * 100.
                if test_acc > nn_max_acc:
                    nn_max_acc = test_acc
                print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc

                #now, see the results of svm
                if use_svm:
                    train_y=[]
                    train_features=[]
                    for index in train_batch_start: 
                        cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size],
                                                                  indices_train_r[index: index + batch_size],
                                                                  trainY[index: index + batch_size],
                                                                  trainLeftPad_l[index],
                                                                  trainRightPad_l[index],
                                                                  trainLeftPad_r[index],
                                                                  trainRightPad_r[index],
                                                                  trainLengths_l[index],
                                                                  trainLengths_r[index],
                                                                  normalized_train_length_l[index],
                                                                  normalized_train_length_r[index],
                                                                  mt_train[index: index + batch_size],
                                                                  wm_train[index: index + batch_size])
                        train_y.append(y[0])
                        train_features.append(layer3_input[0])
                        #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                    #write_feature.close()
     
                    clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                    clf.fit(train_features, train_y)
                    results=clf.predict(test_features)
                    lr=LinearRegression().fit(train_features, train_y)
                    results_lr=lr.predict(test_features)
                    corr_count=0
                    corr_lr=0
                    test_size=len(test_y)
                    for i in range(test_size):
                        if results[i]==test_y[i]:
                            corr_count+=1
                        if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                            corr_lr+=1
                    acc=corr_count*1.0/test_size
                    acc_lr=corr_lr*1.0/test_size
                    if acc > max_acc:
                        max_acc=acc
                        best_iter=iter
                    if acc_lr> max_acc:
                        max_acc=acc_lr
                        best_iter=iter
                    print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at iter: ', best_iter

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Example #15
# 0
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1, window_width=[2, 5],
                    emb_size=50, char_emb_size=4, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21, 
                    neg_all=100, train_size=200, test_size=200, mark='_forfun'):  #train_size=75909, test_size=17386
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_train(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len

    
    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data=datasets
#     valid_data=datasets[1]
    test_data=datasets_test
#     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
#     
    train_pos_entity_char=train_data[0]
    train_pos_entity_des=train_data[1]
    train_relations=train_data[2]
    train_entity_char_lengths=train_data[3]
    train_entity_des_lengths=train_data[4]
    train_relation_lengths=train_data[5]
    train_mention_char_ids=train_data[6]
    train_remainQ_word_ids=train_data[7]
    train_mention_char_lens=train_data[8]
    train_remainQ_word_len=train_data[9]
    train_entity_scores=train_data[10]

    test_pos_entity_char=test_data[0]
    test_pos_entity_des=test_data[1]
    test_relations=test_data[2]
    test_entity_char_lengths=test_data[3]
    test_entity_des_lengths=test_data[4]
    test_relation_lengths=test_data[5]
    test_mention_char_ids=test_data[6]
    test_remainQ_word_ids=test_data[7]
    test_mention_char_lens=test_data[8]
    test_remainQ_word_len=test_data[9]
    test_entity_scores=test_data[10]
# 
#     test_pos_entity_char=test_data[0]       #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51
#     test_pos_entity_des=test_data[1]        #matrix, each row for a examle: 20*2*51
#     test_relations=test_data[2]             #matrix, each row for a example: 5*51
#     test_entity_char_lengths=test_data[3]   #matrix, each row for a example: 3*2*51  (three valies for one entity)
#     test_entity_des_lengths=test_data[4]    #matrix, each row for a example: 3*2*51  (three values for one entity)
#     test_relation_lengths=test_data[5]      #matrix, each row for a example: 3*51
#     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
#     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
#     test_mention_char_lens=test_data[8]     #matrix, each three values for a mention: 3
#     test_remainQ_word_len=test_data[9]      #matrix, each three values for a remain question: 3
    

    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    if sum(train_sizes)/len(train_sizes)!=train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes)/len(test_sizes)!=test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    
    indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)   
    indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) 
    
#     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
#     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
#     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
#     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
#     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
#     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
#     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
#     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
#     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
#     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)   
#     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
    index = T.lscalar()
    chosed_indices=T.lvector()
    
    ent_char_ids_M = T.lmatrix()   
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()  
    men_lens_M=T.lmatrix()
    rel_word_ids_M=T.lmatrix()
    rel_word_lens_M=T.lmatrix()
    desH_word_ids_M=T.lmatrix()
    desH_word_lens_M=T.lmatrix()
#     desT_word_ids_M=T.lmatrix()
#     desT_word_lens_M=T.lmatrix()
    q_word_ids_M=T.lmatrix()
    q_word_lens_M=T.lmatrix()
    ent_scores=T.dvector()

#max_char_len, max_des_len, max_relation_len, max_Q_len
#     ent_men_ishape = (char_emb_size, max_char_len)  # this is the size of MNIST images
#     rel_ishape=(emb_size, max_relation_len)
#     des_ishape=(emb_size, max_des_len)
#     q_ishape=(emb_size, max_Q_len)
    
    filter_size=(emb_size,window_width[0])
    char_filter_size=(char_emb_size, window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b]
    char_conv_W_into_matrix=char_conv_W.reshape((char_conv_W.shape[0], char_conv_W.shape[2]*char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix=q_desH_conv_W.reshape((q_desH_conv_W.shape[0], q_desH_conv_W.shape[2]*q_desH_conv_W.shape[3]))
#     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,desH_word_ids_f,
                       desH_word_lens_f,
                       men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f):
        

#         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
#         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
        #ent_mention
        ent_char_conv = Conv_with_input_para(rng, input=ent_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng, input=men_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng, input=desH_word_input,
                image_shape=(batch_size, 1, emb_size, max_des_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
#         #q_desT
#         q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_Q_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
#         desT_conv = Conv_with_input_para(rng, input=desT_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_des_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
    #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
    #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')
        
        
        
        ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2])
        
#         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, 
                                                left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, 
                                                dim=max_Q_len+filter_size[1]-1, topk=2)
        
        
        q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        
#         q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
#         desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])    
        
        
        overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\
                    0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0

#                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
                sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,desH_word_ids_M,
                   desH_word_lens_M,
                   men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M])
    
    simi_list+=0.5*ent_scores
    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.mean(loss_simi_list)

    

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((char_embeddings** 2).sum()+(embeddings** 2).sum()+(char_conv_W** 2).sum()+(q_rel_conv_W** 2).sum()+(q_desH_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(char_conv_W_into_matrix)+Diversify_Reg(q_rel_conv_W_into_matrix)+Diversify_Reg(q_desH_conv_W_into_matrix)
    cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    



    test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M,
                                  q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore')
#           givens={
#             ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)),  
#             ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)),
#             men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)),  
#             men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)),
#             rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)),  
#             rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)),
#             desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)), 
#             desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)),
# #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
# #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
#             q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)), 
#             q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)),
#             ent_scores : test_entity_scores[index]},
                                  
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    #+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
#         updates.append((acc_i, acc))    
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size)))))   #Ada
        elif param_i == char_embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(char_emb_size)))))   #AdaGrad
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc)) 
  
    train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates,
          givens={
            ent_char_ids_M : indices_train_pos_entity_char[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            ent_lens_M : indices_train_entity_char_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            men_char_ids_M : indices_train_mention_char_ids[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            men_lens_M : indices_train_mention_char_lens[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)),  
            rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            desH_word_ids_M : indices_train_pos_entity_des[index].reshape((neg_all, max_des_len))[chosed_indices].reshape((train_neg_size, max_des_len)), 
            desH_word_lens_M : indices_train_entity_des_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
#             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
#             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
            q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), 
            q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            ent_scores : indices_train_entity_scores[index][chosed_indices]
            
            }, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    best_test_accu=0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0


        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
 
            minibatch_index=minibatch_index+1
            #print batch_start
            sample_indices=[0]+random.sample(range(1, neg_all), train_neg_size-1)
            loss_simi_i, cost_i= train_model(batch_start, sample_indices)
#             if batch_start%1==0:
#                 print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
#                 store_model_to_file(rootPath, params)
 
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#             
            if iter % n_train_batches == 0:
                 
                test_loss=[]
                succ=0
                for i in range(test_size):
#                     print 'testing', i, '...'
                    #prepare data
                    test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))  
                    test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))
                    test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int64').reshape((length_per_example_test[i], max_relation_len))  
                    test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int64').reshape((length_per_example_test[i], max_des_len))
                    test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int64').reshape((length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX)
             
             
             
             
                                
                    loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M,
                                                       test_desH_word_ids_M, test_desH_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
#                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0]>=max(simi_list_i[1:]):
                        succ+=1
#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ=succ*1.0/test_size
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                           'model %f') %
                          (epoch, minibatch_index, n_train_batches,succ))

                if best_test_accu< succ:
                    best_test_accu=succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock() 

            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #16
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3, 1],
                    maxSentLen=70,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in'
    'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) '
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_pre, conv_b_pre = create_conv_para(rng,
                                              filter_shape=gate_filter_shape)
    conv_W_gate, conv_b_gate = create_conv_para(rng,
                                                filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate,
        conv_b_gate, conv_W2, conv_b2, conv_W2_context
    ]

    "A gated convolution layer to form more expressive word representations in each sentence"
    "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)"
    conv_layer_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_l,
        mask_matrix=sents_mask_l,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    conv_layer_gate_r = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_r,
        mask_matrix=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    '''
    attentive convolution function, two sizes of filter_width 3&1 are used. Multi-channel
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r

    "Batch normalization for the four output sentence representation vectors"
    gamma = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[0]),
        high=1.0 / math.sqrt(hidden_size[0]),
        size=(hidden_size[0])),
                                     dtype=theano.config.floatX),
                          borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]),
                                  dtype=theano.config.floatX),
                         borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r.std((0, ), keepdims=True),
        mode='low_mem')

    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0, ), keepdims=True),
        mode='low_mem')

    "Before logistic regression layer, we insert a hidden layer. Now form input to HL classifier"
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ],
                                     axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]
    "Create hidden layer parameters"
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    "Hidden Layer and batch norm to its output again"
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[1]),
        high=1.0 / math.sqrt(hidden_size[1]),
        size=(hidden_size[1])),
                                        dtype=theano.config.floatX),
                             borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]),
                                     dtype=theano.config.floatX),
                            borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(inputs=HL_layer_1.output,
                                       gamma=gamma_HL,
                                       beta=beta_HL,
                                       mean=HL_layer_1.output.mean(
                                           (0, ), keepdims=True),
                                       std=HL_layer_1.output.std(
                                           (0, ), keepdims=True),
                                       mode='low_mem')
    "Form input to LR classifier"
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    "Logistic Regression layer"
    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [
        init_embeddings
    ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2
                                                     and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    '''
                    best dev model, test
                    '''
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))

                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
# Example #17
# 0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}
    root = '/save/wenpeng/datasets/FEVER/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    vocabfile = codecs.open(root + 'word2id.txt', 'r', 'utf-8')
    word2id = json.loads(vocabfile.read())
    # co=0
    # for line in vocabfile:
    #     word2id = json.loads(line)
    #     co+=1
    # print 'co: ', co
    # word2id = json.load(open(root+'word2id.json')) #json.loads(vocabfile)
    vocabfile.close()
    print 'load word2id over'
    # train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id  = load_fever_train(sent_len, claim_len, cand_size)
    # train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(sent_len, claim_len, cand_size, word2id)
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, _ = load_SciTailV1_dataset(
        sent_len, word2id)
    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, _ = load_RTE_dataset_as_test(sent_len, word2id)

    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)

    vocab_size = len(word2id) + 1

    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    # id2word = {y:x for x,y in word2id.iteritems()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.imatrix()  #(batch, cand_size, sent_len)
    sents_mask = T.fmatrix()
    # sents_labels=T.imatrix() #(batch, cand_size)
    # claim_ids = T.imatrix() #(batch, claim_len)
    # claim_mask = T.fmatrix()

    # joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    # joint_sents_mask=T.ftensor3()
    # joint_sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()
    labels = T.ivector()

    # test_premise_ids = T.imatrix()
    # test_premise_matrix = T.fmatrix()
    # test_hypo_ids = T.imatrix()
    # test_hypo_matrix = T.fmatrix()
    # test_scitail_minibatch_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, sent_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    # task1_att_conv_W, task1_att_conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    # task1_conv_W_context, task1_conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context, conv_b_context
    ]
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    joint_premise_emb = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    # joint_batch_sent_emb = joint_sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))
    # joint_premise_emb = T.sum(joint_batch_sent_emb*joint_sents_labels.dimshuffle(0,1,'x'), axis=1) #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    # joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0,2,1).reshape((batch_size, cand_size*sent_len, emb_size))
    # joint_sents_dot = T.batched_dot(joint_sents_tensor3, joint_sents_tensor3.dimshuffle(0,2,1)) #(batch_size, cand_size*sent_len, cand_size*sent_len)
    # joint_sents_dot_2_matrix = T.nnet.softmax(joint_sents_dot.reshape((batch_size*cand_size*sent_len, cand_size*sent_len)))
    # joint_sents_context = T.batched_dot(joint_sents_dot_2_matrix.reshape((batch_size, cand_size*sent_len, cand_size*sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size)
    # joint_add_sents_context = joint_embed_input_sents+joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_embed_input_sents,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=joint_embed_input_claim,
        mask_matrix=sents_mask,
        mask_matrix_r=claim_mask,
        image_shape=(batch_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([
        joint_premise_hypo_emb, attentive_sent_embeddings_l,
        attentive_sent_embeddings_r
    ],
                                   axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    # joint_loss=joint_layer_LR.negative_log_likelihood(joint_labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    '''
    testing
    joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    joint_sents_mask=T.ftensor3()
    joint_sents_labels=T.imatrix() #(batch, cand_size)
    joint_claim_ids = T.imatrix() #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels=T.ivector()
    '''
    pred_minibatch_labels = joint_layer_LR.y_pred
    pred_minibatch_labels_2_2classes = T.where(pred_minibatch_labels > 1, 0,
                                               pred_minibatch_labels)

    pred_minibatch_error = T.mean(
        T.neq(pred_minibatch_labels_2_2classes, labels))

    params = [init_embeddings] + NN_para + joint_LR_para
    load_model_from_file(root + 'para_for_test_scitail', params)

    # train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask,joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids, sents_mask, claim_ids, claim_mask, labels],
        pred_minibatch_error,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], pred_minibatch_error, allow_input_downcast=True, on_unused_input='ignore')

    # test_model = theano.function([sents_ids,sents_mask,sents_labels, claim_ids,claim_mask, joint_labels], [inter_matrix,test_layer_LR.errors(joint_labels), test_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_ids,sents_mask,sents_labels, claim_ids,claim_mask, joint_labels], [inter_matrix,test_layer_LR.errors(joint_labels), test_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... testing'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    # joint_n_train_batches=joint_train_size/batch_size
    # joint_train_batch_start=list(np.arange(joint_n_train_batches)*batch_size)+[joint_train_size-batch_size]
    # n_train_batches=train_size/batch_size
    # train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]

    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    remain_test_batches = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0

    error_sum = 0.0
    for idd, test_batch_id in enumerate(
            test_batch_start):  # for each test batch
        error_i = test_model(
            test_sents_l[test_batch_id:test_batch_id + batch_size],
            test_masks_l[test_batch_id:test_batch_id + batch_size],
            test_sents_r[test_batch_id:test_batch_id + batch_size],
            test_masks_r[test_batch_id:test_batch_id + batch_size],
            test_labels_store[test_batch_id:test_batch_id + batch_size])
        error_sum += error_i
    test_acc = 1.0 - error_sum / (len(test_batch_start))
    print '\tcurrent test_acc:', test_acc
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=10,
                    filter_size=[3, 5],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2], axis=1)
    LR_input_size = hidden_size[0] * 2
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))

    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [embeddings
              ] + NN_para + LR_para  # put all model parameters together
    cost = loss  #+Div_reg*diversify_reg#+L2_weight*L2_reg
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''
    binarize_prob = T.where(score_matrix > 0.5, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask],
                                 binarize_prob,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0
        cost_i = 0.0
        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]

                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
# Example #19
# 0
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[44], batch_size=1, window_width=3,
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=24):
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_nonoverlap.txt', rootPath+'train_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate,maxSentLength, entailment=True)#vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_rule_features_cosine_eucli_negation_len1_len2.txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2.txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]

    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_nonoverlap_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    

    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    
    linear=Linear(norm_uni_l, norm_uni_r)
    poly=Poly(norm_uni_l, norm_uni_r)
    sigmoid=Sigmoid(norm_uni_l, norm_uni_r)
    rbf=RBF(norm_uni_l, norm_uni_r)
    gesd=GESD(norm_uni_l, norm_uni_r)
    
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([mts,
                                eucli_1,uni_cosine,#linear, poly,sigmoid,rbf, gesd, #sum_uni_r-sum_uni_l,
                                layer1.output_eucli_to_simi,layer1.output_cosine, #layer1.output_vector_r-layer1.output_vector_l,
                                len_l, len_r,
                                extra
                                #discri
                                #wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=14+(2)+(2)+2+5, n_out=3)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            #wmf: wm_test[index: index + batch_size]
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params 
    params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()
                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                #lr=LinearRegression().fit(train_features, train_y)
                #results_lr=lr.predict(test_features)
                corr_count=0
                #corr_lr=0
                corr_neu=0
                neu_co=0
                corr_ent=0
                ent_co=0
                corr_contr=0
                contr_co=0
                test_size=len(test_y)
                for i in range(test_size):
                    if test_y[i]==0:#NEUTRAL
                        neu_co+=1
                        if results[i]==test_y[i]:
                            corr_neu+=1
                    elif test_y[i]==1:#ENTAILMENT
                        ent_co+=1
                        if results[i]==test_y[i]:
                            corr_ent+=1
                    elif test_y[i]==2:#CONTRADICTION
                        contr_co+=1
                        if results[i]==test_y[i]:
                            corr_contr+=1
                    '''
                    if results[i]==test_y[i]:
                        corr_count+=1
                        if test_y[i]==0: #NEUTRAL
                            corr_neu+=1
                    '''
                        
                    #if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                    #    corr_lr+=1
                corr_count=corr_neu+corr_ent+corr_contr
                acc=corr_count*1.0/test_size
                acc_neu=corr_neu*1.0/neu_co
                acc_ent=corr_ent*1.0/ent_co
                acc_contr=corr_contr*1.0/contr_co
                #acc_lr=corr_lr*1.0/test_size
                if acc > max_acc:
                    max_acc=acc
                    best_epoch=epoch
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch                 
                #if acc_lr> max_acc:
                #    max_acc=acc_lr
                #    best_epoch=epoch
                print '\t\t\tsvm acc: ', acc, ' max acc: ',    max_acc,'(at',best_epoch,')',' Neu: ',acc_neu, ' Ent: ',acc_ent, ' Contr: ',acc_contr 

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    batch_size=100,
                    emb_size=300,
                    char_emb_size=20,
                    hidden_size=10,
                    L2_weight=0.0001,
                    p_len_limit=400,
                    test_p_len_limit=100,
                    q_len_limit=20,
                    char_len=15,
                    filter_size=[5, 5, 5, 5, 5],
                    char_filter_size=5,
                    margin=0.85,
                    extra_size=5 + 11,
                    extra_emb=10,
                    distance=10,
                    distance_emb=10,
                    comment='add distance embs'):  #extra_size=3+46+7
    test_batch_size = batch_size
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    word2id = {}
    char2id = {}

    train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras, word2id, char2id = load_squad_cnn_rank_span_word_train(
        word2id, char2id, p_len_limit, q_len_limit, char_len)
    test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_span_word_dev(
        word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    '''
    #store variables into file
    '''
    #     train_variables = [train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras]
    #     test_variables =[test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list]
    #     with open(rootPath+'extra.3.pickle', 'wb') as f:  # Python 3: open(..., 'wb')
    #         cPickle.dump(train_variables+test_variables, f, protocol=cPickle.HIGHEST_PROTOCOL)
    #     f.close()
    #     print 'variable stored successfully'
    #     exit(0)
    '''
    load variables from file
    '''
    #     before_load_time = time.time()
    #     with open(rootPath+'extra.3.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    #         train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras,test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = cPickle.load(f)
    #     f.close()
    #     print 'load data variables successfully, spend: ',     (time.time()-before_load_time)/60.0, ' mins'
    train_size = len(train_para_list)
    test_size = len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask,
                                    dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask,
                                      dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask,
                                         dtype=theano.config.floatX)

    train_para_extras = numpy.asarray(train_para_extras,
                                      dtype=theano.config.floatX)

    train_span_label_list = numpy.asarray(train_span_label_list, dtype='int32')
    train_word_label_list = numpy.asarray(train_word_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask,
                                     dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask,
                                        dtype=theano.config.floatX)

    test_para_extras = numpy.asarray(test_para_extras,
                                     dtype=theano.config.floatX)

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX, rng)
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX, rng)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    extra_rand_values = random_value_normal((extra_size, extra_emb),
                                            theano.config.floatX, rng)
    extra_embeddings = theano.shared(value=extra_rand_values, borrow=True)

    distance_rand_values = random_value_normal(
        (2 * distance + 1, distance_emb), theano.config.floatX, rng)
    distance_embeddings = theano.shared(value=distance_rand_values,
                                        borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    span_indices = T.ivector()  #batch
    word_indices = T.imatrix()  #(batch, 2)
    ans_indices = T.ivector()  # for one batch, the length is dynamic
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    extra = T.ftensor3()  #(batch, p_len, 3)

    char_paragraph = T.imatrix()  #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask = T.fmatrix()
    char_q_mask = T.fmatrix()

    true_p_len = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]

    extra_rep_batch = T.concatenate(
        [extra.dot(extra_embeddings), extra],
        axis=2)  #(batch, p_len, extra_emb+extra_size)
    zero_pad = T.zeros((true_batch_size, 1, extra_emb + extra_size))
    left_context = T.concatenate([zero_pad, extra_rep_batch[:, :-1, :]],
                                 axis=1)  #(batch, p_len, extra_emb+extra_size)
    right_context = T.concatenate(
        [extra_rep_batch[:, 1:, :], zero_pad],
        axis=1)  #(batch, p_len, extra_emb+extra_size)

    left_context_2 = T.concatenate(
        [zero_pad, zero_pad, extra_rep_batch[:, :-2, :]],
        axis=1)  #(batch, p_len, extra_emb+extra_size)
    right_context_2 = T.concatenate(
        [extra_rep_batch[:, 2:, :], zero_pad, zero_pad],
        axis=1)  #(batch, p_len, extra_emb+extra_size)

    simi2left = T.sum(extra_rep_batch * left_context,
                      axis=2).dimshuffle(0, 1, 'x')  #(batch, p_len, 1)
    simi2right = T.sum(extra_rep_batch * right_context,
                       axis=2).dimshuffle(0, 1, 'x')  #(batch, p_len, 1)
    cos2left = cosine_tensor3(extra_rep_batch, left_context,
                              2).dimshuffle(0, 1, 'x')
    cos2right = cosine_tensor3(extra_rep_batch, right_context,
                               2).dimshuffle(0, 1, 'x')
    diff2left = extra_rep_batch - left_context
    diff2right = extra_rep_batch - right_context  #(batch, p_len, extra_emb+extra_size)
    extra_rep_batch = T.concatenate(
        [
            extra_rep_batch, left_context, right_context, left_context_2,
            right_context_2, diff2left, diff2right, simi2left, simi2right,
            cos2left, cos2right
        ],
        axis=2)  #batch, p_len, 7*(extra_emb+extra_size)+4)
    true_extra_size = 7 * (extra_emb + extra_size) + 4

    common_input_p = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, true_p_len,
         emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    common_input_q = embeddings[questions.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size))

    char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape(
        (true_batch_size * true_p_len, char_len, char_emb_size
         ))  #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q = char_embeddings[char_questions.flatten()].reshape(
        (true_batch_size * q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape(
        (true_batch_size * true_p_len, char_len))
    char_q_masks = char_q_mask.reshape(
        (true_batch_size * q_len_limit, char_len))

    conv_W_char, conv_b_char = create_conv_para(
        rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1 = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1,
                      emb_size + char_emb_size + true_extra_size,
                      filter_size[0]))
    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size, 1,
                                                        hidden_size,
                                                        filter_size[1]))
    conv_W_3, conv_b_3 = create_conv_para(rng,
                                          filter_shape=(hidden_size, 1,
                                                        hidden_size,
                                                        filter_size[2]))
    #     conv_W_4, conv_b_4=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3]))
    #     conv_W_5, conv_b_5=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4]))

    conv_W_1_q, conv_b_1_q = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + char_emb_size,
                      filter_size[0]))
    conv_W_2_q, conv_b_2_q = create_conv_para(rng,
                                              filter_shape=(hidden_size, 1,
                                                            hidden_size,
                                                            filter_size[1]))
    conv_W_3_q, conv_b_3_q = create_conv_para(rng,
                                              filter_shape=(hidden_size, 1,
                                                            hidden_size,
                                                            filter_size[2]))
    #     conv_W_4_q, conv_b_4_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3]))
    #     conv_W_5_q, conv_b_5_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4]))
    CNN_para = [
        conv_W_1,
        conv_b_1,
        conv_W_2,
        conv_b_2,
        conv_W_1_q,
        conv_b_1_q,
        conv_W_2_q,
        conv_b_2_q,
        conv_W_3,
        conv_b_3,
        conv_W_3_q,
        conv_b_3_q,
        #              conv_W_4, conv_b_4, conv_W_5, conv_b_5,conv_W_4_q, conv_b_4_q, conv_W_5_q, conv_b_5_q,
        conv_W_char,
        conv_b_char
    ]

    span_input4score, word_input4score, overall_span_hidden_size, overall_word_hidden_size = squad_cnn_rank_spans_word(
        rng,
        common_input_p,
        common_input_q,
        char_common_input_p,
        char_common_input_q,
        batch_size,
        p_len_limit,
        q_len_limit,
        emb_size,
        char_emb_size,
        char_len,
        filter_size,
        char_filter_size,
        hidden_size,
        conv_W_1,
        conv_b_1,
        conv_W_2,
        conv_b_2,
        conv_W_1_q,
        conv_b_1_q,
        conv_W_2_q,
        conv_b_2_q,
        conv_W_char,
        conv_b_char,
        conv_W_3,
        conv_b_3,
        conv_W_3_q,
        conv_b_3_q,
        #                         conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q,
        #                         conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q,
        para_mask,
        q_mask,
        char_p_masks,
        char_q_masks,
        extra_rep_batch,
        true_extra_size)

    test_span_input4score, test_word_input4score, _, _ = squad_cnn_rank_spans_word(
        rng,
        common_input_p,
        common_input_q,
        char_common_input_p,
        char_common_input_q,
        test_batch_size,
        test_p_len_limit,
        q_len_limit,
        emb_size,
        char_emb_size,
        char_len,
        filter_size,
        char_filter_size,
        hidden_size,
        conv_W_1,
        conv_b_1,
        conv_W_2,
        conv_b_2,
        conv_W_1_q,
        conv_b_1_q,
        conv_W_2_q,
        conv_b_2_q,
        conv_W_char,
        conv_b_char,
        conv_W_3,
        conv_b_3,
        conv_W_3_q,
        conv_b_3_q,
        #                         conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q,
        #                         conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q,
        para_mask,
        q_mask,
        char_p_masks,
        char_q_masks,
        extra_rep_batch,
        true_extra_size)  #(batch, hidden, gram_size)

    gram_size = 5 * true_p_len - (0 + 1 + 2 + 3 + 4)

    #     U_a = create_ensemble_para(rng, 1, 4*hidden_size)
    #     norm_U_a=normalize_matrix(U_a)
    #     span_scores_matrix=T.dot(span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((batch_size, gram_size))  #(batch, 13*para_len-78, 1)

    span_HL_1_para = create_ensemble_para(rng, hidden_size,
                                          overall_span_hidden_size)
    span_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_U_a = create_ensemble_para(rng, 1,
                                    hidden_size + overall_span_hidden_size)
    norm_span_U_a = normalize_matrix(span_U_a)
    norm_span_HL_1_para = normalize_matrix(span_HL_1_para)
    norm_span_HL_2_para = normalize_matrix(span_HL_2_para)
    norm_span_HL_3_para = normalize_matrix(span_HL_3_para)
    norm_span_HL_4_para = normalize_matrix(span_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(span_input4score,
                                           norm_span_HL_1_para,
                                           norm_span_HL_2_para,
                                           norm_span_HL_3_para,
                                           norm_span_HL_4_para, norm_span_U_a,
                                           batch_size, gram_size)

    span_scores = T.nnet.softmax(span_scores_matrix)  #(batch, 7*para_len-21)
    loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), span_indices]))

    #ranking loss
    tanh_span_scores_matrix = span_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)

    index_matrix = T.zeros((batch_size, gram_size), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(
        index_matrix[T.arange(batch_size), span_indices], 1.0)

    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0 -
                                               new_index_matrix).nonzero()]

    repeat_posi = T.extra_ops.repeat(prob_batch_posi,
                                     prob_batch_nega.shape[0],
                                     axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0),
                                     prob_batch_posi.shape[0],
                                     axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega))

    span_loss = loss_neg_likelihood + loss_rank

    #     test_span_scores_matrix=T.dot(test_span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((true_batch_size, gram_size))  #(batch, 13*para_len-78)
    test_span_scores_matrix = add_HLs_2_tensor3(
        test_span_input4score, norm_span_HL_1_para, norm_span_HL_2_para,
        norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a,
        true_batch_size, gram_size)

    #word
    HL_1_para = create_ensemble_para(rng, hidden_size,
                                     overall_word_hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    start_U_a = create_ensemble_para(rng, 1,
                                     hidden_size + overall_word_hidden_size)
    norm_start_U_a = normalize_matrix(start_U_a)
    norm_HL_1_para = normalize_matrix(HL_1_para)
    norm_HL_2_para = normalize_matrix(HL_2_para)
    norm_HL_3_para = normalize_matrix(HL_3_para)
    norm_HL_4_para = normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(
        rng, hidden_size, overall_word_hidden_size + distance_emb)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(
        rng, 1, hidden_size + overall_word_hidden_size + distance_emb)
    end_norm_U_a = normalize_matrix(end_U_a)
    end_norm_HL_1_para = normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para = normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para = normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para = normalize_matrix(end_HL_4_para)

    start_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_HL_1_para,
                                            norm_HL_2_para, norm_HL_3_para,
                                            norm_HL_4_para, norm_start_U_a,
                                            batch_size, true_p_len)
    start_scores = T.nnet.softmax(start_scores_matrix)  #(batch, para_len)
    '''
    forward start info to end prediction
    '''
    distance_matrix = word_indices[:, 0].dimshuffle(
        0, 'x') - T.arange(true_p_len).dimshuffle('x', 0)  #(batch, p_len)
    distance_trunc_matrix = T.maximum(
        -distance, T.minimum(distance,
                             distance_matrix)) + distance  #(batch, p_len)
    zero_distance_matrix = T.zeros(
        (true_batch_size * true_p_len, 2 * distance + 1))
    filled_distance_matrix = T.set_subtensor(
        zero_distance_matrix[T.arange(true_batch_size * true_p_len),
                             distance_trunc_matrix.flatten()], 1.0)
    filled_distance_tensor3 = filled_distance_matrix.reshape(
        (true_batch_size,
         true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle(
             0, 2, 1)  #(batch_size, distance_emb, p_len)

    end_word_input4score = T.concatenate(
        [word_input4score, filled_distance_tensor3],
        axis=1)  #(batch, +distance_emb, p_len)

    end_scores_matrix = add_HLs_2_tensor3(end_word_input4score,
                                          end_norm_HL_1_para,
                                          end_norm_HL_2_para,
                                          end_norm_HL_3_para,
                                          end_norm_HL_4_para, end_norm_U_a,
                                          batch_size, true_p_len)
    end_scores = T.nnet.softmax(end_scores_matrix)  #(batch, para_len)
    start_loss_neg_likelihood = -T.mean(
        T.log(start_scores[T.arange(batch_size), word_indices[:, 0]]))
    end_loss_neg_likelihood = -T.mean(
        T.log(end_scores[T.arange(batch_size), word_indices[:, 1]]))

    #ranking loss start
    tanh_start_scores_matrix = start_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    start_index_matrix = T.zeros((batch_size, p_len_limit),
                                 dtype=theano.config.floatX)
    start_new_index_matrix = T.set_subtensor(
        start_index_matrix[T.arange(batch_size), word_indices[:, 0]], 1.0)
    start_prob_batch_posi = tanh_start_scores_matrix[
        start_new_index_matrix.nonzero()]
    start_prob_batch_nega = tanh_start_scores_matrix[(
        1.0 - start_new_index_matrix).nonzero()]
    start_repeat_posi = T.extra_ops.repeat(start_prob_batch_posi,
                                           start_prob_batch_nega.shape[0],
                                           axis=0)
    start_repeat_nega = T.extra_ops.repeat(start_prob_batch_nega.dimshuffle(
        'x', 0),
                                           start_prob_batch_posi.shape[0],
                                           axis=0).flatten()
    start_loss_rank = T.mean(
        T.maximum(0.0, margin - start_repeat_posi + start_repeat_nega))

    #ranking loss END
    end_tanh_scores_matrix = end_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit),
                               dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(
        end_index_matrix[T.arange(batch_size), word_indices[:, 1]], 1.0)
    end_prob_batch_posi = end_tanh_scores_matrix[
        end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_scores_matrix[(
        1.0 - end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi,
                                         end_prob_batch_nega.shape[0],
                                         axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle(
        'x', 0),
                                         end_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    end_loss_rank = T.mean(
        T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega))

    word_loss = start_loss_neg_likelihood + end_loss_neg_likelihood + start_loss_rank + end_loss_rank

    #test
    test_start_scores_matrix = add_HLs_2_tensor3(
        test_word_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para,
        norm_HL_4_para, norm_start_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    mask_test_start_return = test_start_scores_matrix * para_mask  #(batch, p_len)
    '''
    forward start info to end prediction in testing
    '''
    test_distance_matrix = T.argmax(mask_test_start_return, axis=1).dimshuffle(
        0, 'x') - T.arange(true_p_len).dimshuffle('x', 0)  #(batch, p_len)
    test_distance_trunc_matrix = T.maximum(
        -distance, T.minimum(distance,
                             test_distance_matrix)) + distance  #(batch, p_len)
    test_zero_distance_matrix = T.zeros(
        (true_batch_size * true_p_len, 2 * distance + 1))
    test_filled_distance_matrix = T.set_subtensor(
        test_zero_distance_matrix[T.arange(true_batch_size * true_p_len),
                                  test_distance_trunc_matrix.flatten()], 1.0)
    test_filled_distance_tensor3 = test_filled_distance_matrix.reshape(
        (true_batch_size,
         true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle(
             0, 2, 1)  #(batch_size, distance_emb, p_len)
    test_end_word_input4score = T.concatenate(
        [test_word_input4score, test_filled_distance_tensor3],
        axis=1)  #(batch, +distance-emb, p_len)

    end_test_scores_matrix = add_HLs_2_tensor3(
        test_end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
        end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    end_mask_test_return = end_test_scores_matrix * para_mask  #(batch, p_len)

    word_gram_1 = mask_test_start_return + end_mask_test_return
    word_gram_2 = mask_test_start_return[:, :
                                         -1] + end_mask_test_return[:,
                                                                    1:]  #(batch* hidden_size, maxsenlen-1)
    word_gram_3 = mask_test_start_return[:, :
                                         -2] + end_mask_test_return[:,
                                                                    2:]  #(batch* hidden_size, maxsenlen-2)
    word_gram_4 = mask_test_start_return[:, :
                                         -3] + end_mask_test_return[:,
                                                                    3:]  #(batch* hidden_size, maxsenlen-3)
    word_gram_5 = mask_test_start_return[:, :
                                         -4] + end_mask_test_return[:,
                                                                    4:]  #(batch* hidden_size, maxsenlen-4)
    word_pair_scores = T.concatenate(
        [word_gram_1, word_gram_2, word_gram_3, word_gram_4, word_gram_5],
        axis=1)  #(batch_size, gram_size)

    #ans words train
    ans_HL_1_para = create_ensemble_para(rng, hidden_size,
                                         overall_word_hidden_size)
    ans_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_U_a = create_ensemble_para(rng, 1,
                                   hidden_size + overall_word_hidden_size)
    norm_ans_U_a = normalize_matrix(ans_U_a)
    norm_ans_HL_1_para = normalize_matrix(ans_HL_1_para)
    norm_ans_HL_2_para = normalize_matrix(ans_HL_2_para)
    norm_ans_HL_3_para = normalize_matrix(ans_HL_3_para)
    norm_ans_HL_4_para = normalize_matrix(ans_HL_4_para)

    ans_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_ans_HL_1_para,
                                          norm_ans_HL_2_para,
                                          norm_ans_HL_3_para,
                                          norm_ans_HL_4_para, norm_ans_U_a,
                                          batch_size, true_p_len)
    ans_scores_vec = T.nnet.softmax(
        ans_scores_matrix).flatten()  #(batch, para_len)
    ans_loss_neg_likelihood = -T.mean(T.log(ans_scores_vec[ans_indices]))

    ans_index_vec = T.zeros((batch_size, p_len_limit),
                            dtype=theano.config.floatX).flatten()
    ans_new_index = T.set_subtensor(ans_index_vec[ans_indices], 1.0)
    ans_prob_batch_posi = ans_scores_vec[ans_new_index.nonzero()]
    ans_prob_batch_nega = ans_scores_vec[(1.0 - ans_new_index).nonzero()]
    ans_repeat_posi = T.extra_ops.repeat(ans_prob_batch_posi,
                                         ans_prob_batch_nega.shape[0],
                                         axis=0)
    ans_repeat_nega = T.extra_ops.repeat(ans_prob_batch_nega.dimshuffle(
        'x', 0),
                                         ans_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    ans_loss_rank = T.mean(
        T.maximum(0.0, margin - ans_repeat_posi + ans_repeat_nega))

    ans_loss = ans_loss_neg_likelihood + ans_loss_rank
    #ans words test
    test_ans_scores_matrix = add_HLs_2_tensor3(
        test_word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para,
        norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a, true_batch_size,
        true_p_len)
    test_ans_scores_matrix = test_ans_scores_matrix * para_mask  #T.nnet.softmax(test_ans_scores_matrix) #(batch, para_len)
    ans_gram_1 = test_ans_scores_matrix
    ans_gram_2 = (test_ans_scores_matrix[:, :-1] +
                  test_ans_scores_matrix[:, 1:]
                  ) / 2.0  #(batch* hidden_size, maxsenlen-1)
    ans_gram_3 = (test_ans_scores_matrix[:, :-2] +
                  test_ans_scores_matrix[:, 1:-1] +
                  test_ans_scores_matrix[:, 2:]
                  ) / 3.0  #(batch* hidden_size, maxsenlen-2)
    ans_gram_4 = (
        test_ans_scores_matrix[:, :-3] + test_ans_scores_matrix[:, 1:-2] +
        test_ans_scores_matrix[:, 2:-1] + test_ans_scores_matrix[:, 3:]
    ) / 4.0  #(batch* hidden_size, maxsenlen-3)
    ans_gram_5 = (
        test_ans_scores_matrix[:, :-4] + test_ans_scores_matrix[:, 1:-3] +
        test_ans_scores_matrix[:, 2:-2] + test_ans_scores_matrix[:, 3:-1] +
        test_ans_scores_matrix[:,
                               4:]) / 5.0  #(batch* hidden_size, maxsenlen-4)
    ans_word_scores = T.concatenate(
        [ans_gram_1, ans_gram_2, ans_gram_3, ans_gram_4, ans_gram_5],
        axis=1)  #(batch, hidden_size, maxsenlen-(0+1+2+3+4))
    '''
    form test spans and masks
    '''
    test_span_word_scores_matrix = word_pair_scores + ans_word_scores  #test_span_scores_matrix+
    test_spans_mask_1 = para_mask
    test_spans_mask_2 = para_mask[:, :
                                  -1] * para_mask[:,
                                                  1:]  #(batch* hidden_size, maxsenlen-1)
    test_spans_mask_3 = para_mask[:, :
                                  -2] * para_mask[:, 1:
                                                  -1] * para_mask[:,
                                                                  2:]  #(batch* hidden_size, maxsenlen-2)
    test_spans_mask_4 = para_mask[:, :
                                  -3] * para_mask[:, 1:
                                                  -2] * para_mask[:, 2:
                                                                  -1] * para_mask[:,
                                                                                  3:]  #(batch* hidden_size, maxsenlen-3)
    test_spans_mask_5 = para_mask[:, :
                                  -4] * para_mask[:, 1:
                                                  -3] * para_mask[:, 2:
                                                                  -2] * para_mask[:,
                                                                                  3:
                                                                                  -1] * para_mask[:,
                                                                                                  4:]
    test_spans_mask = T.concatenate([
        test_spans_mask_1, test_spans_mask_2, test_spans_mask_3,
        test_spans_mask_4, test_spans_mask_5
    ],
                                    axis=1)  #(batch, 5*p_len -)
    #     test_return=T.argmax(test_span_word_scores_matrix, axis=1) #batch T.argmax(test_span_word_scores_matrix*test_spans_mask, axis=1) #batch
    test_return = T.argmax(test_span_word_scores_matrix * test_spans_mask,
                           axis=1)  #batch

    #     params = [embeddings,char_embeddings]+NN_para+[U_a]
    params = (
        [embeddings, char_embeddings, extra_embeddings, distance_embeddings] +
        CNN_para
        #               +[span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para]
        + [start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] +
        [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para] +
        [ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para])

    L2_reg = L2norm_paraList([
        embeddings,
        char_embeddings,
        extra_embeddings,
        distance_embeddings,
        conv_W_1,
        conv_W_2,
        conv_W_1_q,
        conv_W_2_q,
        conv_W_char,
        conv_W_3,
        conv_W_3_q,
        #     conv_W_4, conv_W_5,conv_W_4_q, conv_W_5_q,
        #     span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para,
        start_U_a,
        HL_1_para,
        HL_2_para,
        HL_3_para,
        HL_4_para,
        end_U_a,
        end_HL_1_para,
        end_HL_2_para,
        end_HL_3_para,
        end_HL_4_para,
        ans_U_a,
        ans_HL_1_para,
        ans_HL_2_para,
        ans_HL_3_para,
        ans_HL_4_para
    ])
    #L2_reg = L2norm_paraList(params)
    cost = word_loss + ans_loss + L2_weight * L2_reg  #span_loss+

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([
        paragraph, questions, span_indices, word_indices, ans_indices,
        para_mask, q_mask, extra, char_paragraph, char_questions,
        char_para_mask, char_q_mask, true_p_len
    ],
                                  cost,
                                  updates=updates,
                                  on_unused_input='ignore')

    test_model = theano.function([
        paragraph, questions, para_mask, q_mask, extra, char_paragraph,
        char_questions, char_para_mask, char_q_mask, true_p_len
    ],
                                 test_return,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / test_batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(numpy.arange(n_test_batches) *
                            test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.Random(200).shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter is the cumulative number of batches run so far, counted across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_ids[para_id:para_id + batch_size]
            boundary_labels_batch = train_word_label_list[train_id_batch]
            ans_label_list = []
            for i in range(batch_size):
                start = boundary_labels_batch[i][0] + i * p_len_limit
                end = boundary_labels_batch[i][1] + i * p_len_limit
                ans_label_list += range(start, end + 1)
            ans_label_list = numpy.asarray(ans_label_list, dtype='int32')

            cost_i += train_model(
                train_para_list[train_id_batch], train_Q_list[train_id_batch],
                train_span_label_list[train_id_batch], boundary_labels_batch,
                ans_label_list, train_para_mask[train_id_batch],
                train_Q_mask[train_id_batch],
                train_para_extras[train_id_batch],
                train_para_char_list[train_id_batch],
                train_Q_char_list[train_id_batch],
                train_para_char_mask[train_id_batch],
                train_Q_char_mask[train_id_batch], p_len_limit)

            #print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict = {}
                q_amount = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids = test_model(
                        test_para_list[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_list[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_mask[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_mask[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_extras[test_para_id:test_para_id +
                                         test_batch_size],
                        test_para_char_list[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id +
                                         test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id +
                                         test_batch_size], test_p_len_limit)
                    test_para_wordlist_batch = test_para_wordlist_list[
                        test_para_id:test_para_id + test_batch_size]
                    q_ids_batch = q_idlist[test_para_id:test_para_id +
                                           test_batch_size]
                    q_amount += test_batch_size

                    for q in range(test_batch_size):  #for each question
                        pred_ans = decode_predict_id(
                            batch_predict_ids[q], test_para_wordlist_batch[q])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
#                         print q_id, test_para_wordlist_batch[q],'\t',pred_ans
                with codecs.open(rootPath + 'predictions.txt', 'w',
                                 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json',
                                                  rootPath + 'predictions.txt')
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc


#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_exact_acc
Example #21
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in'
    'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) '
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    "Logistic Regression layer"
    LR_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l + attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = 4 * hidden_size[0]

    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [init_embeddings] + NN_para + LR_para
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 1000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                '''
                test
                '''
                error_sum = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                test_acc = 1.0 - error_sum / (len(test_batch_start))

                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300,
                    L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5],
                    char_filter_size=3, margin=2.0, max_EM=50.302743615):
    test_batch_size=batch_size*10
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)


    word2id={}
    char2id={}
    #questions,paragraphs,q_masks,p_masks,labels, word2id
    train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size=len(train_para_list)

    test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size=len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX)



    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng)
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices= T.imatrix() #batch, (start, end) for each sample
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')

    char_paragraph = T.imatrix() #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask=T.fmatrix()
    char_q_mask=T.fmatrix()

    true_p_len = T.iscalar()



    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]

    common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size))


    char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len))
    char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len))

    conv_W_char, conv_b_char=create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0]))
    conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))

    conv_W_1_q, conv_b_1_q=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0]))
    conv_W_2_q, conv_b_2_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))
    NN_para=[conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char]

    input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,batch_size, p_len_limit,q_len_limit,
                         emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size,
                         conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char,
                         para_mask, q_mask, char_p_masks,char_q_masks)

    test_input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,test_batch_size, test_p_len_limit,q_len_limit,
                         emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size,
                         conv_W_1, conv_b_1,conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char,
                         para_mask, q_mask, char_p_masks,char_q_masks)  #(batch, hidden, #(batch, 2*hidden, p_len_limit))

    # gram_size = 5*true_p_len-(0+1+2+3+4)


    HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a=normalize_matrix(U_a)
    norm_HL_1_para=normalize_matrix(HL_1_para)
    norm_HL_2_para=normalize_matrix(HL_2_para)
    norm_HL_3_para=normalize_matrix(HL_3_para)
    norm_HL_4_para=normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size)
    end_norm_U_a=normalize_matrix(end_U_a)
    end_norm_HL_1_para=normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para=normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para=normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para=normalize_matrix(end_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, batch_size,true_p_len)
    span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len)
    end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para, end_norm_U_a, batch_size,true_p_len)
    end_span_scores=T.nnet.softmax(end_span_scores_matrix) #(batch, para_len)
    loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,0]]))
    end_loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,1]]))

    #ranking loss start
    tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size)
    index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), gold_indices[:,0]], 1.0)
    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()]
    repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    #ranking loss END
    end_tanh_span_scores_matrix = end_span_scores#T.tanh(span_scores_matrix) #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), gold_indices[:,1]], 1.0)
    end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0-end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x',0), end_prob_batch_posi.shape[0], axis=0).flatten()
    end_loss_rank = T.mean(T.maximum(0.0, margin-end_repeat_posi+end_repeat_nega))






    loss = loss_neg_likelihood +end_loss_neg_likelihood+loss_rank+end_loss_rank

    #test
    test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para,norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len)
    mask_test_return=T.argmax(test_span_scores_matrix*para_mask, axis=1) #batch

    end_test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para,end_norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len)
    end_mask_test_return=T.argmax(end_test_span_scores_matrix*para_mask, axis=1) #batch



    params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]+[end_U_a,end_HL_1_para,end_HL_2_para,end_HL_3_para,end_HL_4_para]

    L2_reg =L2norm_paraList([embeddings,char_embeddings,conv_W_1,conv_W_2,conv_W_1_q, conv_W_2_q, conv_W_char,U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+L2_weight*L2_reg


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask,    char_paragraph, #(batch, char_len*p_len)
        char_questions, char_para_mask, char_q_mask, true_p_len], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask,
        char_paragraph,
        char_questions,
        char_para_mask,
        char_q_mask,
                true_p_len], [mask_test_return,end_mask_test_return], on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_ids[para_id:para_id+batch_size]
            cost_i+= train_model(
                                 train_para_list[train_id_batch],
                                 train_Q_list[train_id_batch],
                                 train_label_list[train_id_batch],
                                 train_para_mask[train_id_batch],
                                 train_Q_mask[train_id_batch],
                                 train_para_char_list[train_id_batch],
                                 train_Q_char_list[train_id_batch],
                                 train_para_char_mask[train_id_batch],
                                 train_Q_char_mask[train_id_batch],
                                 p_len_limit)


            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict={}
                q_amount=0
                p1=0
                for test_para_id in test_batch_start:
                    batch_predict_ids, batch_predict_end_ids=test_model(
                                                 test_para_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_para_char_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_char_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_char_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_char_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_p_len_limit)
                    test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size]
#                     test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
#                     q_amount+=test_batch_size
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
                    q_amount+=test_batch_size

                    for q in range(test_batch_size): #for each question
#                         pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])

                        start = batch_predict_ids[q]
                        end = batch_predict_end_ids[q]
                        if end < start:
                            start, end = end, start
                        pred_ans = ' '.join(test_para_wordlist_batch[q][start:end+1])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc






            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x')

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(tyope_size, emb_size)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]

    # NN_para = multiCNN_para+ACNN_para

    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0] + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_att_a = create_ensemble_para(
        rng, 12, LR_att_input_size)  # the weight matrix hidden_size*2
    LR_att_b = theano.shared(value=np.zeros((12, ),
                                            dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    acnn_LR_input = T.concatenate(
        [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a = create_ensemble_para(
        rng, 12, acnn_LR_input_size)  # the weight matrix hidden_size*2
    acnn_LR_b = theano.shared(value=np.zeros((12, ),
                                             dtype=theano.config.floatX),
                              name='LR_b',
                              borrow=True)  #bias for each target class
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)

    acnn_loss = -T.mean(T.log(acnn_prob_pos))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)

    params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para  # put all model parameters together
    cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() +
                                                 (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1),
        acnn_score_matrix.dimshuffle('x', 0, 1)
    ],
                                             axis=0),
                               axis=0)
    ensemble_scores = 0.5 * ensemble_NN_scores + 0.5 * cosine_score_matrix
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], label_sent,
                                  label_mask)

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Example #24
# 0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=1e-5,
                    extra_size=4,
                    emb_size=300,
                    batch_size=100,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    max_term_len=4,
                    p_mode='conc'):
    """Train a binary term-pair classifier and evaluate it on the LenciBenotto set.

    This is Python 2 / Theano code. Each example consists of two terms
    (``word1`` / ``word2``), a sentence for each side (``sents_l`` /
    ``sents_r``; presumably the terms' definitions -- TODO confirm against the
    loaders), one extra scalar feature and a 0/1 label. Four ``Conv_for_Pair``
    layers that share one set of convolution parameters compare
    term-vs-term, term-vs-sentence, sentence-vs-term and sentence-vs-sentence;
    their pooled features, the cosine of the two summed term embeddings and
    the extra feature feed a 2-way logistic-regression layer trained with
    negative log-likelihood.

    Every 100 training iterations the whole test set is scored, average
    precision (full ranking and top 100) is computed with ``apk``, and the
    parameters are written to disk whenever the full-ranking AP improves.

    Args:
        learning_rate: passed to ``Gradient_Cost_Para``.
        n_epochs: number of passes over the training portion.
        L2_weight: not used; the L2 term is commented out below.
        extra_size: not used in this function.
        emb_size: width of the word-embedding matrix.
        batch_size: fixed batch size baked into the Theano reshapes.
        filter_size: only ``filter_size[0]`` is read (convolution width).
        maxSentLen: padded sentence length.
        hidden_size: only ``hidden_size[1]`` is read (convolution feature size).
        max_term_len: padded term length in tokens.
        p_mode: ``'conc'`` concatenates the four pair representations; any
            other value takes their element-wise maximum.

    Returns:
        None. Progress and AP values are printed to stdout, the running time
        to stderr; the word2id map, the initial embeddings and the best
        parameters are stored under ``root_dic``.

    Note:
        ``LenciBenotto_file`` and ``root_dic`` are not defined in this
        function; they are expected to exist at module level.
    """

    # Must stay the first statement so that only the arguments are captured.
    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words(
        maxlen=maxSentLen, wordlen=max_term_len
    )  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    # The test loader receives word2id and returns it again (presumably
    # extended with test-only words -- TODO confirm); the result is saved.
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id = load_task_hyper_vs_all_with_words(
        LenciBenotto_file, maxSentLen, word2id, wordlen=max_term_len)
    store_word2id(word2id, root_dic + 'LenciBenotto_word2id.pkl')
    # exit(0)
    # The last hold_test_size loaded examples are excluded from training;
    # nothing below reads them.
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_word1 = np.asarray(all_word1[:train_size], dtype='int32')
    train_word2 = np.asarray(all_word2[:train_size], dtype='int32')
    test_word1 = np.asarray(test_word1, dtype='int32')
    test_word2 = np.asarray(test_word2, dtype='int32')

    train_word1_mask = np.asarray(all_word1_mask[:train_size],
                                  dtype=theano.config.floatX)
    train_word2_mask = np.asarray(all_word2_mask[:train_size],
                                  dtype=theano.config.floatX)
    test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_extra = np.asarray(all_extra[:train_size],
                             dtype=theano.config.floatX)
    test_extra = np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1  # one extra row for the zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    # row 0 (padding) is zeroed, then pretrained word2vec vectors are loaded
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable
    # The embeddings are saved once here; they are not part of `params`
    # below, so the stored best-parameter files do not contain them.
    store_model_to_file(root_dic + 'LenciBenotto_best_para_init_embeddings',
                        [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up embeddings for a (batch_size, maxSentLen) id matrix and return
    # them as (batch_size, emb_size, maxSentLen).
    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    # Term embeddings are kept as (batch_size, term_len, emb_size); the
    # masked sum over the token axis gives one vector per term.
    embed_word1 = init_embeddings[word1_ids.flatten()].reshape(
        (batch_size, word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape(
        (batch_size, word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    # conv_b_context is passed to the layers but is not listed as a
    # trainable parameter.
    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''
    # The four pair layers below all share conv_W / conv_b / conv_W_context /
    # conv_b_context. Each yields a left and a right pooled vector, combined
    # as [l, r, l * r, l - r].

    # term 1 vs term 2
    term_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=word1_mask,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([
        tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r,
        tt_embeddings_l - tt_embeddings_r
    ],
                         axis=1)

    # term 1 vs right sentence
    term_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_input_r,
        mask_matrix=word1_mask,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([
        td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r,
        td_embeddings_l - td_embeddings_r
    ],
                         axis=1)

    # left sentence vs term 2
    def_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=sents_mask_l,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([
        dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r,
        dt_embeddings_l - dt_embeddings_r
    ],
                         axis=1)

    # left sentence vs right sentence
    def_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([
        dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r,
        dd_embeddings_l - dd_embeddings_r
    ],
                         axis=1)

    # Combine the four pair representations: concatenate them, or take
    # their element-wise maximum. p_len is the resulting feature width used
    # to size the classifier input.
    if p_mode == 'conc':
        p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4 * 4 * hidden_size[1]
    else:
        p = T.max(T.concatenate([
            p_ww.dimshuffle('x', 0, 1),
            p_wd.dimshuffle('x', 0, 1),
            p_dw.dimshuffle('x', 0, 1),
            p_dd.dimshuffle('x', 0, 1)
        ],
                                axis=0),
                  axis=0)
        p_len = 4 * hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)

    "form input to LR classifier"
    # pair features + one cosine column + one extra-feature column
    LR_input = T.concatenate([
        p,
        cosine_matrix1_matrix2_rowwise(word1_embedding,
                                       word2_embedding).dimshuffle(0, 'x'),
        extra.dimshuffle(0, 'x')
    ],
                             axis=1)
    LR_input_size = p_len + 1 + 1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # weights of the 2-class output layer
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    # NOTE(review): the meaning of bias=0.25 is defined by LogisticRegression,
    # which is not visible here -- confirm before changing it.
    layer_LR = LogisticRegression(
        rng,
        input=LR_input,
        n_in=LR_input_size,
        n_out=2,
        W=U_a,
        b=LR_b,
        bias=0.25
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    # Only the convolution and classifier parameters are listed for updates.
    params = NN_para + LR_para  #[init_embeddings]

    # L2 regularisation is disabled, so L2_weight has no effect.
    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # Returns, per batch: the error value, the predicted labels and
    # layer_LR.prop_for_posi (used below as the ranking score for AP).
    test_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # patience and done_looping are leftovers; nothing below reads them
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    # Integer division on Python 2. The extra start index
    # train_size - batch_size is always appended so the tail of the training
    # set is reachable with a full-size batch.
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    # For the test set the overlapping tail batch is only added when there is
    # a remainder; its duplicated rows are trimmed after scoring.
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0  # not used below

    cost_i = 0.0
    train_indices = range(train_size)  # a list on Python 2, shuffled in place

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                train_word1[train_id_batch], train_word2[train_id_batch],
                train_word1_mask[train_id_batch],
                train_word2_mask[train_id_batch], train_extra[train_id_batch],
                train_labels_store[train_id_batch])

            # every 100 iterations, evaluate the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                # pred_labels, gold_labels and error_sum are collected or
                # initialised here but not used after the loop
                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_word1[test_batch_id:test_batch_id + batch_size],
                        test_word2[test_batch_id:test_batch_id + batch_size],
                        test_word1_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_word2_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_extra[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                # keep all scores of the regular batches plus only the last
                # n_test_remain scores of the overlapping tail batch, so that
                # every test example is counted exactly once
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                # average precision over the full ranking and over the top 100
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)

                # a new parameter file (AP value in its name) is written on
                # every improvement of the full-ranking AP
                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                    store_model_to_file(
                        root_dic + 'LenciBenotto_best_para_' +
                        str(max_ap_test), params)
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(term1_str, term2_str):
    """Score one (term1, term2) pair with stored attentive-convolution weights.

    The function looks up a definition for each term with ``load_concept_def``,
    turns the pair into index/mask arrays with ``parse_individual_termPair``,
    rebuilds the Theano graph (four ``Conv_for_Pair`` comparisons that share
    one set of filters, followed by a two-class ``LogisticRegression``),
    restores the embedding matrix and the model parameters from disk, and
    prints the classifier outputs for every test batch.

    Args:
        term1_str: first term; passed to ``load_concept_def`` and
            ``parse_individual_termPair``.
        term2_str: second term; used the same way.

    Returns:
        None. ``layer_LR.y_pred`` and ``layer_LR.prop_for_posi`` are printed.
    """
    emb_size=300
    filter_size=[3,3]   # only filter_size[0] is read below
    maxSentLen=40   # definition length assumed by the reshapes and image shapes below
    hidden_size=[300,300]   # only hidden_size[1] is read below
    max_term_len=4   # term length passed as `wordlen` and used in the term image shapes
    p_mode = 'conc'   # selects the concatenation branch when the four pair features are combined
    batch_size = 1

    # Fetch and show the definition (and its source) for each term.
    term1_def, source1 = load_concept_def(term1_str)
    print '\n',term1_str, ':\t', term1_def,'\t', source1,'\n'
    term2_def, source2 = load_concept_def(term2_str)
    print '\n',term2_str, ':\t', term2_def, '\t', source2,'\n'
    # exit(0)

    word2id = load_word2id('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_word2id.pkl')
    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    # fixed seed so that parameter initialisation is reproducible

    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id  =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len)  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id = load_EVAlution_hyper_vs_all_with_words(maxSentLen, word2id, wordlen=max_term_len)
    # Build the model inputs for this single pair; no labels are produced here.
    # NOTE(review): word2id is returned again, so it may have been extended with
    # unseen words -- confirm the stored embedding matrix covers every id.
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask, test_extra, word2id = parse_individual_termPair(term1_str, term2_str, term1_def, term2_def, maxSentLen, word2id, wordlen=max_term_len)
    # total_size = len(all_sentences_l)
    # hold_test_size = 10000
    # train_size = total_size - hold_test_size



    # Convert every input to a numpy array: int32 for ids, floatX for masks and `extra`.
    # train_sents_l=np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    # train_masks_l=np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)

    # train_sents_r=np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r=np.asarray(test_sents_r, dtype='int32')

    # train_masks_r=np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)

    # train_word1=np.asarray(all_word1[:train_size], dtype='int32')
    # train_word2=np.asarray(all_word2[:train_size], dtype='int32')
    test_word1=np.asarray(test_word1, dtype='int32')
    test_word2=np.asarray(test_word2, dtype='int32')

    # train_word1_mask=np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX)
    # train_word2_mask=np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX)
    test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX)

    # train_labels_store=np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    # test_labels_store=np.asarray(test_labels, dtype='int32')

    # train_extra=np.asarray(all_extra[:train_size], dtype=theano.config.floatX)
    test_extra=np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size=len(test_extra)
    print ' test size: ', len(test_extra)

    vocab_size=len(word2id)+1   # one more row than there are entries in word2id


    # Random placeholder values; the matrix is handed to load_model_from_file further down.
    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   # Gaussian-initialised matrix
    #here, we leave code for loading word2vec to initialize words
    # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    # id2word = {y:x for x,y in word2id.iteritems()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   # wrap the numpy matrix in a Theano shared variable
    # store_model_to_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings])
    # exit(0)
    # Symbolic inputs of the model: id matrices are int32, masks are float32.
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    # labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    # Look up the rows for `sent_ids` and return a tensor of shape
    # (batch_size, emb_size, maxSentLen).
    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    # Term embeddings keep the (batch_size, term_len, emb_size) layout; they are
    # dimshuffled to (batch, emb, len) where they are fed to Conv_for_Pair.
    embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size))
    # Mask-weighted sum over the term positions. These two vectors are not used
    # by anything that follows in this function.
    word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1)
    word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1)


    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    # conv_b_context is not in this list, so it is not among the variables
    # handed to load_model_from_file below and keeps its freshly created value.
    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''
    # The four Conv_for_Pair layers below all receive the same filters
    # (conv_W, conv_b, conv_W_context, conv_b_context); they differ only in
    # which inputs are paired: term/term, term/definition, definition/term,
    # definition/definition.
    term_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = word1_mask,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    # Pair feature: [left, right, left*right, left-right] joined along axis 1.
    p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1)

    # term1 against the right-hand definition
    term_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_input_r,
             mask_matrix = word1_mask,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1)


    # left-hand definition against term2
    def_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = sents_mask_l,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1)


    # left-hand definition against right-hand definition
    def_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1)

    # Combine the four pair features: concatenate them ('conc'), or take the
    # element-wise maximum across them (any other p_mode).
    if p_mode == 'conc':
        p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4*4*hidden_size[1]
    else:
        p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0)
        p_len =4*hidden_size[1]
    "form input to LR classifier"
    # `extra` contributes one additional column, hence the +1 on the input size.
    LR_input = T.concatenate([p,extra.dimshuffle(0,'x')],axis=1)
    LR_input_size=p_len+1

    U_a = create_ensemble_para(rng, 2, LR_input_size) # classifier weight matrix for the 2 output classes
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  # one bias per target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b) # two-class classifier over LR_input using U_a and LR_b



    # Restore stored values: first the embedding matrix, then the model
    # parameters. NOTE(review): load_model_from_file presumably fills the listed
    # shared variables in order, so the order of `params` must match the order
    # used when the file was written -- confirm against store_model_to_file.
    params = NN_para+LR_para #[init_embeddings]
    load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings])

    load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)

    # Compiled forward pass; returns the predicted label and `prop_for_posi`
    # (presumably the probability of the positive class -- confirm in LogisticRegression).
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TEST MODEL  #
    ###############
    print '... testing'


    # Python 2 integer division: number of full batches.
    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    # When the data does not divide evenly, add one last batch that ends exactly
    # at the final example (it overlaps the previous batch).
    if n_test_remain!=0:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)



    # max_acc_dev=0.0
    # max_ap_test=0.0
    # max_ap_topk_test=0.0
    # max_f1=0.0

    # cost_i=0.0
    # train_indices = range(train_size)


    # Run the model batch by batch and print the raw outputs.
    for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])
        print pred_i, prob_i
def evaluate_lenet5(learning_rate=0.085, n_epochs=2000, nkerns=[1,1], batch_size=1, window_width=3,
                    maxSentLength=60, emb_size=300, L2_weight=0.0005, update_freq=1, unifiedWidth_conv0=8, k_dy=3, ktop=3):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_msr_corpus(rootPath+'vocab.txt', rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength)
    mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/'
    #mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size))
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    #mts=T.dmatrix()
    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv0=ishape[1]+filter_size[1]-1
    poolsize1=(1, length_after_wideConv0)
    length_after_wideConv1=unifiedWidth_conv0+filter_size[1]-1
    poolsize2=(1, length_after_wideConv1)
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_l, right=right_l, 
                        W=conv_W, b=conv_b,
                        firstLayer=True)
    layer0_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), poolsize=poolsize1, k=k_dy, unifiedWidth=unifiedWidth_conv0, left=left_r, right=right_r, 
                        W=conv_W, b=conv_b,
                        firstLayer=True)

    layer0_l_output=debug_print(layer0_ll.fold_output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_rr.fold_output, 'layer0_r.output')
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=ishape[0]/2,
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)

    conv_W2, conv_b2=create_conv_para(rng, filter_shape=(1, 1, filter_size[0]/2, filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer1_ll=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_ll.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0),
            filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_ll.leftPad, right=layer0_ll.rightPad, 
                        W=conv_W2, b=conv_b2,
                        firstLayer=False)
    layer1_rr=Conv_Fold_DynamicK_PoolLayer_NAACL(rng, input=layer0_rr.output,
            image_shape=(batch_size, nkerns[0], ishape[0]/2, unifiedWidth_conv0),
            filter_shape=(nkerns[1], nkerns[0], filter_size[0]/2, filter_size[1]), poolsize=poolsize2, k=ktop, unifiedWidth=ktop, left=layer0_rr.leftPad, right=layer0_rr.rightPad, 
                        W=conv_W2, b=conv_b2,
                        firstLayer=False)

    layer1_l_output=debug_print(layer1_ll.fold_output, 'layer1_l.output')
    layer1_r_output=debug_print(layer1_rr.fold_output, 'layer1_r.output')
    
    layer2=Average_Pooling_for_Top(rng, input_l=layer1_l_output, input_r=layer1_r_output, kern=ishape[0]/4,
                                       left_l=layer0_ll.leftPad, right_l=layer0_ll.rightPad, left_r=layer0_rr.leftPad, right_r=layer0_rr.rightPad, 
                                       length_l=k_dy+filter_size[1]-1, length_r=k_dy+filter_size[1]-1,
                                       dim=unifiedWidth_conv0+filter_size[1]-1)    

    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1=EUCLID(sum_uni_l, sum_uni_r)
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([#mts, 
                                eucli_1, uni_cosine,
                                #norm_uni_l, norm_uni_r,#uni_cosine,#norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                
                                layer1.output_eucli_to_simi,layer1.output_cosine,
                                layer1.output_attentions, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                #layer1.output_vector_l,layer1.output_vector_r,
                                
                                layer2.output_eucli_to_simi,layer2.output_cosine,
                                layer2.output_attentions,
                                #layer2.output_vector_l,layer2.output_vector_r,
                                
                                len_l, len_r
                                #layer1.output_attentions,
                                #wmf,
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(2)+(2+4*4)+(2+4*4)+2, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum()+(conv_W2**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y), layer3.y_pred, layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index]
            #mts: mt_test[index: index + batch_size],
            #wmf: wm_test[index: index + batch_size]
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W]+[conv_W2]# + layer1.params 
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #grad_i=debug_print(grad_i,'grad_i')
        #norm=T.sqrt((grad_i**2).sum())
        #if T.lt(norm_threshold, norm):
        #    print 'big norm'
        #    grad_i=grad_i*(norm_threshold/norm)
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index,cost_tmp], [cost,layer3.errors(y), layer3_input], updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index]
            #mts: mt_train[index: index + batch_size],
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index]
            #mts: mt_train[index: index + batch_size],
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'cost_ij: ', cost_ij
                cost_tmp+=cost_ij
                error_sum+=error_ij
            else:
                cost_average, error_ij, layer3_input= train_model(batch_start,cost_tmp)
                #print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' sum error: '+str(error_sum)+'/'+str(update_freq)
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print layer3_input
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, pred_y, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                
                test_score = numpy.mean(test_losses)
                test_acc=1-test_score
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                #now, see the results of svm
                #write_feature=open('feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n')
                #write_feature.close()

                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                        corr_lr+=1
                acc=corr_count*1.0/test_size
                acc_lr=corr_lr*1.0/test_size
                if acc > max_acc:
                    max_acc=acc
                    best_epoch=epoch
                if acc_lr> max_acc:
                    max_acc=acc_lr
                    best_epoch=epoch
                if test_acc> max_acc:
                    max_acc=test_acc
                    best_epoch=epoch
                print '\t\t\t\t\t\t\t\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ',    max_acc , ' at epoch: ', best_epoch     
                #exit(0)
            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50], batch_size=1, window_width=3,
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.00065, Div_reg=0.01, update_freq=1, norm_threshold=5.0, max_truncate=33, max_truncate_nonoverlap=24):
    maxSentLength=max_truncate+2*(window_width-1)
    maxSentLength_nonoverlap=max_truncate_nonoverlap+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True)#vocab_size contain train, dev and test
    datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]

    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    #nonoverlap
    indices_train_nonoverlap, trainY_nonoverlap, trainLengths_nonoverlap, normalized_train_length_nonoverlap, trainLeftPad_nonoverlap, trainRightPad_nonoverlap= datasets_nonoverlap[0]
    indices_train_l_nonoverlap=indices_train_nonoverlap[::2,:]
    indices_train_r_nonoverlap=indices_train_nonoverlap[1::2,:]
    trainLengths_l_nonoverlap=trainLengths_nonoverlap[::2]
    trainLengths_r_nonoverlap=trainLengths_nonoverlap[1::2]
    normalized_train_length_l_nonoverlap=normalized_train_length_nonoverlap[::2]
    normalized_train_length_r_nonoverlap=normalized_train_length_nonoverlap[1::2]

    trainLeftPad_l_nonoverlap=trainLeftPad_nonoverlap[::2]
    trainLeftPad_r_nonoverlap=trainLeftPad_nonoverlap[1::2]
    trainRightPad_l_nonoverlap=trainRightPad_nonoverlap[::2]
    trainRightPad_r_nonoverlap=trainRightPad_nonoverlap[1::2]    
    indices_test_nonoverlap, testY_nonoverlap, testLengths_nonoverlap,normalized_test_length_nonoverlap, testLeftPad_nonoverlap, testRightPad_nonoverlap= datasets_nonoverlap[1]

    indices_test_l_nonoverlap=indices_test_nonoverlap[::2,:]
    indices_test_r_nonoverlap=indices_test_nonoverlap[1::2,:]
    testLengths_l_nonoverlap=testLengths_nonoverlap[::2]
    testLengths_r_nonoverlap=testLengths_nonoverlap[1::2]
    normalized_test_length_l_nonoverlap=normalized_test_length_nonoverlap[::2]
    normalized_test_length_r_nonoverlap=normalized_test_length_nonoverlap[1::2]
    
    testLeftPad_l_nonoverlap=testLeftPad_nonoverlap[::2]
    testLeftPad_r_nonoverlap=testLeftPad_nonoverlap[1::2]
    testRightPad_l_nonoverlap=testRightPad_nonoverlap[::2]
    testRightPad_r_nonoverlap=testRightPad_nonoverlap[1::2]  
    '''
    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    '''
    
    indices_train_l_nonoverlap=theano.shared(numpy.asarray(indices_train_l_nonoverlap, dtype=theano.config.floatX), borrow=True)
    indices_train_r_nonoverlap=theano.shared(numpy.asarray(indices_train_r_nonoverlap, dtype=theano.config.floatX), borrow=True)
    indices_test_l_nonoverlap=theano.shared(numpy.asarray(indices_test_l_nonoverlap, dtype=theano.config.floatX), borrow=True)
    indices_test_r_nonoverlap=theano.shared(numpy.asarray(indices_test_r_nonoverlap, dtype=theano.config.floatX), borrow=True)
    indices_train_l_nonoverlap=T.cast(indices_train_l_nonoverlap, 'int64')
    indices_train_r_nonoverlap=T.cast(indices_train_r_nonoverlap, 'int64')
    indices_test_l_nonoverlap=T.cast(indices_test_l_nonoverlap, 'int64')
    indices_test_r_nonoverlap=T.cast(indices_test_r_nonoverlap, 'int64')
    


    rand_values_nonoverlap=random_value_normal((vocab_size_nonoverlap+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values_nonoverlap[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values_nonoverlap=load_word2vec_to_init(rand_values_nonoverlap, rootPath+'vocab_nonoverlap_train_plus_dev_in_word2vec_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings_nonoverlap=theano.shared(value=rand_values_nonoverlap, borrow=True)  
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_l_nonoverlap = T.lmatrix('x_index_l_nonoverlap')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    x_index_r_nonoverlap = T.lmatrix('x_index_r_nonoverlap')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()

    left_l_nonoverlap=T.lscalar()
    right_l_nonoverlap=T.lscalar()
    left_r_nonoverlap=T.lscalar()
    right_r_nonoverlap=T.lscalar()
    length_l_nonoverlap=T.lscalar()
    length_r_nonoverlap=T.lscalar()
    norm_length_l_nonoverlap=T.dscalar()
    norm_length_r_nonoverlap=T.dscalar()

    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    ishape_nonoverlap = (emb_size, maxSentLength_nonoverlap)
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_l_input_nonoverlap = embeddings_nonoverlap[x_index_l_nonoverlap.flatten()].reshape((batch_size,maxSentLength_nonoverlap, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input_nonoverlap = embeddings_nonoverlap[x_index_r_nonoverlap.flatten()].reshape((batch_size,maxSentLength_nonoverlap, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))
    conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    
    layer0_l_nonoverlap = Conv_with_input_para(rng, input=layer0_l_input_nonoverlap,
            image_shape=(batch_size, 1, ishape_nonoverlap[0], ishape_nonoverlap[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r_nonoverlap = Conv_with_input_para(rng, input=layer0_r_input_nonoverlap,
            image_shape=(batch_size, 1, ishape_nonoverlap[0], ishape_nonoverlap[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output_nonoverlap=debug_print(layer0_l_nonoverlap.output, 'layer0_l_nonoverlap.output')
    layer0_r_output_nonoverlap=debug_print(layer0_r_nonoverlap.output, 'layer0_r_nonoverlap.output')
    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    
    layer1_nonoverlap=Average_Pooling_for_Top(rng, input_l=layer0_l_output_nonoverlap, input_r=layer0_r_output_nonoverlap, kern=nkerns[0],
                                       left_l=left_l_nonoverlap, right_l=right_l_nonoverlap, left_r=left_r_nonoverlap, right_r=right_r_nonoverlap, 
                                       length_l=length_l_nonoverlap+filter_size[1]-1, length_r=length_r_nonoverlap+filter_size[1]-1,
                                       dim=maxSentLength_nonoverlap+filter_size[1]-1)
    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    
    linear=Linear(norm_uni_l, norm_uni_r)
    poly=Poly(norm_uni_l, norm_uni_r)
    sigmoid=Sigmoid(norm_uni_l, norm_uni_r)
    rbf=RBF(norm_uni_l, norm_uni_r)
    gesd=GESD(norm_uni_l, norm_uni_r)
    
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    
    sum_uni_l_nonoverlap=T.sum(layer0_l_input_nonoverlap, axis=3).reshape((1, emb_size))
    aver_uni_l_nonoverlap=sum_uni_l_nonoverlap/layer0_l_input_nonoverlap.shape[3]
    norm_uni_l_nonoverlap=sum_uni_l_nonoverlap/T.sqrt((sum_uni_l_nonoverlap**2).sum())
    sum_uni_r_nonoverlap=T.sum(layer0_r_input_nonoverlap, axis=3).reshape((1, emb_size))
    aver_uni_r_nonoverlap=sum_uni_r_nonoverlap/layer0_r_input_nonoverlap.shape[3]
    norm_uni_r_nonoverlap=sum_uni_r_nonoverlap/T.sqrt((sum_uni_r_nonoverlap**2).sum())
    
    uni_cosine_nonoverlap=cosine(sum_uni_l_nonoverlap, sum_uni_r_nonoverlap)
    aver_uni_cosine_nonoverlap=cosine(aver_uni_l_nonoverlap, aver_uni_r_nonoverlap)
    uni_sigmoid_simi_nonoverlap=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l_nonoverlap, norm_uni_r_nonoverlap.T)).reshape((1,1)),'uni_sigmoid_simi')    
    
    
    eucli_1_nonoverlap=1.0/(1.0+EUCLID(sum_uni_l_nonoverlap, sum_uni_r_nonoverlap))#25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    
    len_l_nonoverlap=norm_length_l_nonoverlap.reshape((1,1))
    len_r_nonoverlap=norm_length_r_nonoverlap.reshape((1,1))  
    
    '''
    len_l_nonoverlap=length_l_nonoverlap.reshape((1,1))
    len_r_nonoverlap=length_r_nonoverlap.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    
    layer3_input=T.concatenate([mts,
                                eucli_1,uni_cosine,#linear, poly,sigmoid,rbf, gesd, #sum_uni_r-sum_uni_l,
                                eucli_1_nonoverlap,uni_cosine_nonoverlap,
                                layer1.output_eucli_to_simi,layer1.output_cosine, #layer1.output_vector_r-layer1.output_vector_l,
                                layer1_nonoverlap.output_eucli_to_simi,layer1_nonoverlap.output_cosine,
                                len_l, len_r,
                                len_l_nonoverlap, len_r_nonoverlap,
                                extra
                                #discri
                                #wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=14+(2*2)+(2*2)+(2*2)+9, n_out=3)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(layer3.W.T)+Diversify_Reg(conv_W_into_matrix)
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg+Div_reg*diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],

            x_index_l_nonoverlap: indices_test_l_nonoverlap[index: index + batch_size],
            x_index_r_nonoverlap: indices_test_r_nonoverlap[index: index + batch_size],
            left_l_nonoverlap: testLeftPad_l_nonoverlap[index],
            right_l_nonoverlap: testRightPad_l_nonoverlap[index],
            left_r_nonoverlap: testLeftPad_r_nonoverlap[index],
            right_r_nonoverlap: testRightPad_r_nonoverlap[index],
            length_l_nonoverlap: testLengths_l_nonoverlap[index],
            length_r_nonoverlap: testLengths_r_nonoverlap[index],
            norm_length_l_nonoverlap: normalized_test_length_l_nonoverlap[index],
            norm_length_r_nonoverlap: normalized_test_length_r_nonoverlap[index],

            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            #wmf: wm_test[index: index + batch_size]
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params 
    params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
        
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
  
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    

#     def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
#         updates = []
#         grads = T.grad(cost, params)
#         i = theano.shared(numpy.float64(0.))
#         i_t = i + 1.
#         fix1 = 1. - (1. - b1)**i_t
#         fix2 = 1. - (1. - b2)**i_t
#         lr_t = lr * (T.sqrt(fix2) / fix1)
#         for p, g in zip(params, grads):
#             m = theano.shared(p.get_value() * 0.)
#             v = theano.shared(p.get_value() * 0.)
#             m_t = (b1 * g) + ((1. - b1) * m)
#             v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
#             g_t = m_t / (T.sqrt(v_t) + e)
#             p_t = p - (lr_t * g_t)
#             updates.append((m, m_t))
#             updates.append((v, v_t))
#             updates.append((p, p_t))
#         updates.append((i, i_t))
#         return updates
#      
#     updates=Adam(cost=cost, params=params, lr=0.0005)
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],

            x_index_l_nonoverlap: indices_train_l_nonoverlap[index: index + batch_size],
            x_index_r_nonoverlap: indices_train_r_nonoverlap[index: index + batch_size],
            left_l_nonoverlap: trainLeftPad_l_nonoverlap[index],
            right_l_nonoverlap: trainRightPad_l_nonoverlap[index],
            left_r_nonoverlap: trainLeftPad_r_nonoverlap[index],
            right_r_nonoverlap: trainRightPad_r_nonoverlap[index],
            length_l_nonoverlap: trainLengths_l_nonoverlap[index],
            length_r_nonoverlap: trainLengths_r_nonoverlap[index],
            norm_length_l_nonoverlap: normalized_train_length_l_nonoverlap[index],
            norm_length_r_nonoverlap: normalized_train_length_r_nonoverlap[index],
            
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],

            x_index_l_nonoverlap: indices_train_l_nonoverlap[index: index + batch_size],
            x_index_r_nonoverlap: indices_train_r_nonoverlap[index: index + batch_size],
            left_l_nonoverlap: trainLeftPad_l_nonoverlap[index],
            right_l_nonoverlap: trainRightPad_l_nonoverlap[index],
            left_r_nonoverlap: trainLeftPad_r_nonoverlap[index],
            right_r_nonoverlap: trainRightPad_r_nonoverlap[index],
            length_l_nonoverlap: trainLengths_l_nonoverlap[index],
            length_r_nonoverlap: trainLengths_r_nonoverlap[index],
            norm_length_l_nonoverlap: normalized_train_length_l_nonoverlap[index],
            norm_length_r_nonoverlap: normalized_train_length_r_nonoverlap[index],
            
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            #wmf: wm_train[index: index + batch_size]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    pre_max=-1
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()
                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_lr=0
                corr_neu=0
                neu_co=0
                corr_ent=0
                ent_co=0
                corr_contr=0
                contr_co=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results_lr[i]==test_y[i]:
                        corr_lr+=1
                    if test_y[i]==0:#NEUTRAL
                        neu_co+=1
                        if results[i]==test_y[i]:
                            corr_neu+=1
                    elif test_y[i]==1:#ENTAILMENT
                        ent_co+=1
                        if results[i]==test_y[i]:
                            corr_ent+=1
                    elif test_y[i]==2:#CONTRADICTION
                        contr_co+=1
                        if results[i]==test_y[i]:
                            corr_contr+=1

                        
                    #if numpy.absolute(results_lr[i]-test_y[i])<0.5:
                    #    corr_lr+=1
                corr_count=corr_neu+corr_ent+corr_contr
                acc=corr_count*1.0/test_size
                acc_neu=corr_neu*1.0/neu_co
                acc_ent=corr_ent*1.0/ent_co
                acc_contr=corr_contr*1.0/contr_co
                acc_lr=corr_lr*1.0/test_size
                if acc > max_acc:
                    max_acc=acc
                    best_epoch=epoch
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch                 
                if acc_lr> max_acc:
                    max_acc=acc_lr
                    best_epoch=epoch
                print '\t\t\tsvm:', acc, 'lr:', acc_lr, 'max:',    max_acc,'(at',best_epoch,')','Neu:',acc_neu, 'Ent:',acc_ent, 'Contr:',acc_contr 
                if max_acc > pre_max:
                    write_feature_train=open(rootPath+'train_feature_'+str(max_acc)+'.txt', 'w')
                    write_feature_test=open(rootPath+'test_feature_'+str(max_acc)+'.txt', 'w')
                    for i in range(len(train_features)):
                        write_feature_train.write(' '.join(map(str, train_features[i]))+'\n')
                    for i in range(len(test_features)):
                        write_feature_test.write(' '.join(map(str, test_features[i]))+'\n')
                    write_feature_train.close()
                    write_feature_test.close()
                    print 'features stored over'
                    pre_max=max_acc

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #28
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=10,
                    L2_weight=0.000001,
                    extra_size=4,
                    emb_size=300,
                    posi_emb_size=50,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=50,
                    hidden_size=300):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SciTailV1_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    posi_rand_values = rng.normal(
        0.0, 0.01,
        (maxSentLen,
         posi_emb_size))  #generate a matrix by Gaussian distribution
    posi_embeddings = theano.shared(
        value=np.array(posi_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size, 1, emb_size,
                                                    filter_size[0]))
    conv_W_posi, conv_b_posi = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + posi_emb_size,
                      filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_posi, conv_b_posi, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_posi=conv_W_posi,
        b_posi=conv_b_posi,
        W_context=conv_W_context,
        b_context=conv_b_context,
        posi_emb_matrix=posi_embeddings,
        posi_emb_size=posi_emb_size)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    sent_embeddings_l = attentive_conv_layer.maxpool_vec_l
    sent_embeddings_r = attentive_conv_layer.maxpool_vec_r

    "form input to LR classifier"
    LR_input = T.concatenate([
        sent_embeddings_l, sent_embeddings_r,
        sent_embeddings_l * sent_embeddings_r, attentive_sent_embeddings_l,
        attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = 6 * hidden_size

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [init_embeddings, posi_embeddings] + NN_para + LR_para
    # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

                    error_sum = 0.0
                    for idd, test_batch_id in enumerate(
                            test_batch_start):  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))
                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                        store_model_to_file(
                            '/home/wenpeng/workspace/SciTail/src/model_para_' +
                            str(max_acc_test), params)
                    print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
def evaluate_lenet5(learning_rate=0.06,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=[4, 4],
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40):
    maxSentLength = max_truncate + 2 * (window_width[0] - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_wikiQA_corpus(
        rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt',
        rootPath + 'test_filtered.txt', max_truncate,
        maxSentLength)  #vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test = load_mts_wikiQA(
        mtPath + 'result_train/concate_2mt_train.txt',
        mtPath + 'result_test/concate_2mt_test.txt')
    wm_train, wm_test = load_wmf_wikiQA(
        rootPath + 'train_word_matching_scores.txt',
        rootPath + 'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    wmf = T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size = (emb_size, window_width[0])
    filter_size_2 = (nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_size[0],
                                                    filter_size[1]))
    load_model_from_file([conv_W, conv_b])
    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng,
                                    input=layer0_l_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_r = Conv_with_input_para(rng,
                                    input=layer0_r_input,
                                    image_shape=(batch_size, 1, ishape[0],
                                                 ishape[1]),
                                    filter_shape=(nkerns[0], 1, filter_size[0],
                                                  filter_size[1]),
                                    W=conv_W,
                                    b=conv_b)
    layer0_l_output = debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output = debug_print(layer0_r.output, 'layer0_r.output')
    layer0_para = [conv_W, conv_b]

    layer1 = Average_Pooling(rng,
                             input_l=layer0_l_output,
                             input_r=layer0_r_output,
                             kern=nkerns[0],
                             left_l=left_l,
                             right_l=right_l,
                             left_r=left_r,
                             right_r=right_r,
                             length_l=length_l + filter_size[1] - 1,
                             length_r=length_r + filter_size[1] - 1,
                             dim=maxSentLength + filter_size[1] - 1,
                             window_size=window_width[0],
                             maxSentLength=maxSentLength)

    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1,
                                                      filter_size_2[0],
                                                      filter_size_2[1]))
    #load_model_from_file([conv2_W, conv2_b])
    layer2_l = Conv_with_input_para(
        rng,
        input=layer1.output_tensor_l,
        image_shape=(batch_size, 1, nkerns[0], ishape[1]),
        filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_r = Conv_with_input_para(
        rng,
        input=layer1.output_tensor_r,
        image_shape=(batch_size, 1, nkerns[0], ishape[1]),
        filter_shape=(nkerns[1], 1, filter_size_2[0], filter_size_2[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_para = [conv2_W, conv2_b]

    layer3 = Average_Pooling_for_Top(rng,
                                     input_l=layer2_l.output,
                                     input_r=layer2_r.output,
                                     kern=nkerns[1],
                                     left_l=left_l,
                                     right_l=right_l,
                                     left_r=left_r,
                                     right_r=right_r,
                                     length_l=length_l + filter_size_2[1] - 1,
                                     length_r=length_r + filter_size_2[1] - 1,
                                     dim=maxSentLength + filter_size_2[1] - 1)

    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)

    sum_uni_l = T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l = sum_uni_l / layer0_l_input.shape[3]
    norm_uni_l = sum_uni_l / T.sqrt((sum_uni_l**2).sum())
    sum_uni_r = T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r = sum_uni_r / layer0_r_input.shape[3]
    norm_uni_r = sum_uni_r / T.sqrt((sum_uni_r**2).sum())

    uni_cosine = cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine = cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi = debug_print(
        T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1, 1)),
        'uni_sigmoid_simi')
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1 = 1.0 / (1.0 + EUCLID(sum_uni_l, sum_uni_r))  #25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))

    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [  #mts,
            uni_cosine,  #eucli_1_exp,#uni_sigmoid_simi,  #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
            layer1.
            output_cosine,  #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
            layer3.output_cosine,
            len_l,
            len_r,
            wmf
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(1) + (1) + (1) + 2 + 2,
                                n_out=2)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (conv2_W**2).sum(), 'L2_reg'
    )  #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.prop_for_posi, layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            wmf: wm_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer2_para  #+layer0_para

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches / 5, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    svm_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    prob_i, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt',
                                           test_probs)
                #now, check MAP and MRR
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                     'model %f, MRR  %f') %
                    (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm = clf.decision_function(test_features)
                MAP_svm, MRR_svm = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_svm)

                lr = LinearRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                MAP_lr, MRR_lr = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        #after each epoch, increase the batch_size
        if epoch % 2 == 1:
            update_freq = update_freq * 1
        else:
            update_freq = update_freq / 1
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=0.2):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DSSSS(
        rootPath + 'vocab_DSSSS.txt', rootPath +
        'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt',
        max_s_length, maxSentLength,
        maxDocLength)  #vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
    #          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
    #         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
    #         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
    # return results, line_control
    [
        train_data_D, train_data_A1, train_data_A2, train_data_A3,
        train_data_A4, train_Label, train_Length_D, train_Length_D_s,
        train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
        train_leftPad_D, train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2,
        train_leftPad_A3, train_leftPad_A4, train_rightPad_D,
        train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3, train_rightPad_A4
    ] = train_data
    [
        test_data_D, test_data_A1, test_data_A2, test_data_A3, test_data_A4,
        test_Label, test_Length_D, test_Length_D_s, test_Length_A1,
        test_Length_A2, test_Length_A3, test_Length_A4, test_leftPad_D,
        test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
        test_leftPad_A4, test_rightPad_D, test_rightPad_D_s, test_rightPad_A1,
        test_rightPad_A2, test_rightPad_A3, test_rightPad_A4
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    #     index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    #     y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    #     len_Q=T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    #     left_Q=T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    #     right_Q=T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up the embedding of every word index and reshape the result to a 4D
    # tensor (maxDocLength or batch_size, 1, emb_size, maxSentLength) for the conv layers
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(rng, nkerns[0], nkerns[1])
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A2 = Conv_with_input_para(
        rng,
        input=layer0_A2_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A3 = Conv_with_input_para(
        rng,
        input=layer0_A3_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A4 = Conv_with_input_para(
        rng,
        input=layer0_A4_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output, 'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output, 'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output, 'layer0_A4.output')

    #     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA2 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A2_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A2,
                                      right_r=right_A2,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A2 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA3 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A3_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A3,
                                      right_r=right_A3,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A3 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)
    layer1_DA4 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A4_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A4,
                                      right_r=right_A4,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A4 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    #     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA2 = Conv_with_input_para(
        rng,
        input=layer1_DA2.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA3 = Conv_with_input_para(
        rng,
        input=layer1_DA3.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA4 = Conv_with_input_para(
        rng,
        input=layer1_DA4.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    #     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA2.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA3.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A4 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA4.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel = debug_print(
        layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel = debug_print(
        layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
    layer2_A4_output_sent_rep_Dlevel = debug_print(
        layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')

    #     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA2 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA2.output,
        input_r=layer2_A2_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA3 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA3.output,
        input_r=layer2_A3_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA4 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA4.output,
        input_r=layer2_A4_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #highway-style gating: a sigmoid transform gate blends the sentence-level and doc-level representations

    #     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_DA2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b),
        'transform_gate_DA2')
    transform_gate_DA3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b),
        'transform_gate_DA3')
    transform_gate_DA4 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b),
        'transform_gate_DA4')
    #     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')
    transform_gate_A2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b),
        'transform_gate_A2')
    transform_gate_A3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b),
        'transform_gate_A3')
    transform_gate_A4 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b),
        'transform_gate_A4')

    #     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_D_A2 = (
        1.0 - transform_gate_DA2
    ) * layer1_DA2.output_D_sent_level_rep + transform_gate_DA2 * layer3_DA2.output_D_doc_level_rep
    overall_D_A3 = (
        1.0 - transform_gate_DA3
    ) * layer1_DA3.output_D_sent_level_rep + transform_gate_DA3 * layer3_DA3.output_D_doc_level_rep
    overall_D_A4 = (
        1.0 - transform_gate_DA4
    ) * layer1_DA4.output_D_sent_level_rep + transform_gate_DA4 * layer3_DA4.output_D_doc_level_rep

    #     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
    overall_A2 = (
        1.0 - transform_gate_A2
    ) * layer1_DA2.output_QA_sent_level_rep + transform_gate_A2 * layer2_A2.output_sent_rep_Dlevel
    overall_A3 = (
        1.0 - transform_gate_A3
    ) * layer1_DA3.output_QA_sent_level_rep + transform_gate_A3 * layer2_A3.output_sent_rep_Dlevel
    overall_A4 = (
        1.0 - transform_gate_A4
    ) * layer1_DA4.output_QA_sent_level_rep + transform_gate_A4 * layer2_A4.output_sent_rep_Dlevel

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2 = debug_print(
        cosine(layer1_DA2.output_D_sent_level_rep,
               layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
    simi_sent_level3 = debug_print(
        cosine(layer1_DA3.output_D_sent_level_rep,
               layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
    simi_sent_level4 = debug_print(
        cosine(layer1_DA4.output_D_sent_level_rep,
               layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2 = debug_print(
        cosine(layer3_DA2.output_D_doc_level_rep,
               layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
    simi_doc_level3 = debug_print(
        cosine(layer3_DA3.output_D_doc_level_rep,
               layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
    simi_doc_level4 = debug_print(
        cosine(layer3_DA4.output_D_doc_level_rep,
               layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')
    simi_overall_level2 = debug_print(cosine(overall_D_A2, overall_A2),
                                      'simi_overall_level2')
    simi_overall_level3 = debug_print(cosine(overall_D_A3, overall_A3),
                                      'simi_overall_level3')
    simi_overall_level4 = debug_print(cosine(overall_D_A4, overall_A4),
                                      'simi_overall_level4')

    simi_1 = simi_overall_level1  #+simi_sent_level1+simi_doc_level1
    simi_2 = simi_overall_level2  #+simi_sent_level2+simi_doc_level2
    simi_3 = simi_overall_level3  #+simi_sent_level3+simi_doc_level3
    simi_4 = simi_overall_level4  #+simi_sent_level4+simi_doc_level4
    #     simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0
    #     simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
    #     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
    #     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #     #only use overall_simi
    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin+nega-posi)
    #     posi_simi=simi_overall_level1
    #     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin+nega-posi)
    #     cost=T.maximum(0.0, margin+simi_2-simi_1)+T.maximum(0.0, margin+simi_3-simi_1)+T.maximum(0.0, margin+simi_4-simi_1)
    cost12 = T.maximum(
        0.0, margin + simi_sent_level2 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level2 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level2 - simi_overall_level1)
    cost13 = T.maximum(
        0.0, margin + simi_sent_level3 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level3 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level3 - simi_overall_level1)
    cost14 = T.maximum(
        0.0, margin + simi_sent_level4 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level4 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level4 - simi_overall_level1)
    cost = cost12 + cost13 + cost14
    posi_simi = T.max([simi_sent_level1, simi_doc_level1, simi_overall_level1])
    nega_simi = T.max([
        simi_sent_level2, simi_doc_level2, simi_overall_level2,
        simi_sent_level3, simi_doc_level3, simi_overall_level3,
        simi_sent_level4, simi_doc_level4, simi_overall_level4
    ])

    L2_reg = debug_print(
        (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum(), 'L2_reg'
    )  #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  #a matrix
            #             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            #             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            #             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            #             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))


#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))

    train_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        corr_train = 0
        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1

            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + 'corr rate:' + str(
                            corr_train * 100.0 / train_size)

            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = corr_test * 1.0 / test_size
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50], batch_size=1, window_width=4,
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40):
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]))

    #layer0_output = debug_print(layer0.output, 'layer0.output')
    layer0_l = Conv_with_input_para(rng, input=layer0_l_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_r = Conv_with_input_para(rng, input=layer0_r_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b)
    layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output')
    layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output')
    

    
    layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0],
                                       left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, 
                                       length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1,
                                       dim=maxSentLength+filter_size[1]-1)
    

    
    
    #layer2=HiddenLayer(rng, input=layer1_out, n_in=nkerns[0]*2, n_out=hidden_size, activation=T.tanh)
    
    
    sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    
    uni_cosine=cosine(sum_uni_l, sum_uni_r)
    aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    '''
    linear=Linear(sum_uni_l, sum_uni_r)
    poly=Poly(sum_uni_l, sum_uni_r)
    sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    rbf=RBF(sum_uni_l, sum_uni_r)
    gesd=GESD(sum_uni_l, sum_uni_r)
    '''
    eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2%
    #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
    
    '''
    len_l=length_l.reshape((1,1))
    len_r=length_r.reshape((1,1))  
    '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([#mts,
                                uni_cosine,#eucli_1_exp,#uni_sigmoid_simi,  #norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, #
                                layer1.output_cosine,  #layer1.output_eucli_to_simi_exp,#layer1.output_sigmoid_simi,#layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, #
                                len_l, len_r,wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(1)+(1)+2+2, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ [conv_W, conv_b]#+[embeddings]# + layer1.params 
    params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    prob_i, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs)
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                           'model %f, MRR  %f') %
                          (epoch, minibatch_index, n_train_batches,MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm=clf.decision_function(test_features)
                MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm)
                
                lr=LinearRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        #after each epoch, increase the batch_size
        if epoch%2==1:
            update_freq=update_freq*1
        else:
            update_freq=update_freq/1
        
        #store the paras after epoch 15
        if epoch ==15:
            store_model_to_file(params_conv)
            print 'Finished storing best conv params'
            exit(0)
            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Example #32
# 0
def evaluate_lenet5(file_name,
                    vocab_file,
                    train_file,
                    dev_file,
                    word2vec_file,
                    learning_rate=0.001,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=128,
                    max_d_length=128,
                    margin=0.3):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    f = open(file_name, 'w')
    f.write("model options " + str(model_options) + '\n')
    rng = numpy.random.RandomState(23455)
    train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN(
        vocab_file, train_file, dev_file, max_s_length, maxSentLength,
        maxDocLength)  #vocab_size contain train, dev and test
    f.write('train_size : ' + str(train_size))

    [
        train_data_D, train_data_A1, train_Label, train_Length_D,
        train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_A1, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_A1
    ] = train_data
    [
        test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s,
        test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_A1
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, word2vec_file)
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_A1 = T.lvector()
    y = T.lscalar()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_A1 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_A1 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_A1 = T.lscalar()

    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    f.write('... building the model\n')

    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # this part decides nkern[0] and nkern[1] must be in the same dimension
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')

    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=3)

    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')

    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)

    #high-way

    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')

    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0
    logistic_w, logistic_b = create_logistic_para(rng, 1, 2)
    logistic_para = [logistic_w, logistic_b]
    params += logistic_para
    simi_1 = T.dot(logistic_w, simi_1) + logistic_b.dimshuffle(0, 'x')
    simi_1 = simi_1.dimshuffle(1, 0)

    simi_1 = T.nnet.softmax(simi_1)
    predict = T.argmax(simi_1, axis=1)
    tmp = T.log(simi_1)
    cost = T.maximum(0.0, margin + tmp[0][1 - y] - tmp[0][y])
    L2_reg = (high_W**2).sum() + (conv2_W**2).sum() + (conv_W**2).sum() + (
        logistic_w**2).sum()
    cost = cost + L2_weight * L2_reg

    test_model = theano.function(
        [index],
        [cost, simi_1, predict],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_A1: test_data_A1[index],
            y: test_Label[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_A1: test_Length_A1[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_A1: test_leftPad_A1[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_A1: test_rightPad_A1[index],
        },
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [cost, simi_1, predict],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_A1: train_data_A1[index],
            y: train_Label[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_A1: train_Length_A1[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_A1: train_leftPad_A1[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_A1: train_rightPad_A1[index],
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    f.write('... training\n')
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data

        simi_train = []
        predict_train = []
        for batch_start in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            cost_average, simi, predict = train_model(batch_start)
            simi_train.append(simi)
            predict_train.append(predict)
            if iter % 1000 == 0:
                f.write('@iter :' + str(iter) + '\n')
            if iter % n_train_batches == 0:
                corr_train = compute_corr_train(predict_train, _train_Label)
                res = 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + 'corr rate: ' + str(
                            corr_train * 100.0 / train_size) + '\n'
                f.write(res)

            if iter % validation_frequency == 0 or iter % 20000 == 0:
                posi_test_sent = []
                nega_test_sent = []
                posi_test_doc = []
                nega_test_doc = []
                posi_test_overall = []
                nega_test_overall = []

                simi_test = []
                predict_test = []
                for i in test_batch_start:
                    cost, simi, predict = test_model(i)
                    #print simi
                    #f.write('test_predict : ' + str(predict) + ' test_simi : ' + str(simi) + '\n' )
                    simi_test.append(simi)
                    predict_test.append(predict)
                corr_test = compute_corr(simi_test, predict_test, f)
                test_acc = corr_test * 1.0 / (test_size / 4.0)
                res = '\t\t\tepoch ' + str(epoch) + ', minibatch ' + str(
                    minibatch_index) + ' / ' + str(
                        n_train_batches) + ' test acc of best model ' + str(
                            test_acc * 100.0) + '\n'
                f.write(res)

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                res = '\t\t\tmax: ' + str(max_acc) + ' (at ' + str(
                    best_epoch) + ')\n'
                f.write(res)
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 5],
                    maxSentLen=60,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_NYT_dataset(
        maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    hidden_size[0],
                                                    filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size[1], 1,
                                                        hidden_size[0],
                                                        filter_size[1]))
    conv_W_context_2, conv_b_context_2 = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_2, conv_b_2, conv_W_context_2
    ]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer_2 = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W_2,
        b=conv_b_2,
        W_context=conv_W_context_2,
        b_context=conv_b_context_2)

    attentive_sent_embeddings_l_2 = attentive_conv_layer_2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r_2 = attentive_conv_layer_2.attentive_maxpool_vec_r

    #form input to HL layers
    HL_layer_1_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r,
        attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2,
        attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2
    ],
                                     axis=1)

    HL_layer_1_input_size = 6 * hidden_size[1]

    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             activation=T.nnet.relu)
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=hidden_size[1],
                             n_out=hidden_size[1],
                             activation=T.nnet.relu)

    # LR_input_size=HL_layer_1_input_size+2*hidden_size[0]
    "form input to LR classifier"
    LR_input = T.tanh(
        T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],
                      axis=1))
    LR_input_size = HL_layer_1_input_size + 2 * hidden_size[1]

    U_a = create_ensemble_para(
        rng, 3, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    '''
    testing
    '''

    test_preds = T.argmax(layer_LR.p_y_given_x, axis=1)
    transfered_preds = T.eq(test_preds, 2)
    test_error = T.mean(T.neq(transfered_preds, labels))

    params = [init_embeddings
              ] + NN_para + HL_layer_1.params + HL_layer_2.params + LR_para

    cost = loss

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        [test_error, transfered_preds],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_labels_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_labels_i)
                    gold_labels += list(
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])
                test_acc = 1.0 - error_sum / (len(test_batch_start))
                test_f1 = f1_score_2_binary_list(
                    gold_labels, pred_labels)  #, average='binary')

                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                if test_f1 > max_f1:
                    max_f1 = test_f1

                    # store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params)

                print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
Example #34
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=200,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_concMT_BBN_NI_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_il_and_MT(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(tyope_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 5)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 3)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.09, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3,
                    maxSentLength=64, maxDocLength=60, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.00065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus(rootPath+'vocab.txt', rootPath+'mc500.train.tsv_standardlized.txt', rootPath+'mc500.test.tsv_standardlized.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
#     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
#     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, 
                 train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data
    [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, 
                 test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data                


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A= T.lvector()
    y = T.lvector()  
    
    len_D=T.lscalar()
    len_D_s=T.lvector()
    len_Q=T.lscalar()
    len_A=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
    left_Q=T.lscalar()
    left_A=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
    right_Q=T.lscalar()
    right_A=T.lscalar()
        

    #wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A_input = embeddings[index_A.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
        
    conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
#     load_model_for_conv1([conv_W, conv_b])

    layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
            image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A = Conv_with_input_para(rng, input=layer0_A_input,
            image_shape=(batch_size, 1, ishape[0], ishape[1]),
            filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    
    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A_output=debug_print(layer0_A.output, 'layer0_A.output')
    layer0_para=[conv_W, conv_b]    

    layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                     left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A, right_r=right_A, 
                      length_D_s=len_D_s+filter_words[1]-1, length_r=len_A+filter_words[1]-1,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA = Conv_with_input_para(rng, input=layer1_DA.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A_output_sent_rep_Dlevel=debug_print(layer2_A.output_sent_rep_Dlevel, 'layer2_A.output_sent_rep_Dlevel')
    layer2_para=[conv2_W, conv2_b]
    
    layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA=Average_Pooling_for_Top(rng, input_l=layer2_DA.output, input_r=layer2_A_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0, 
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    
    #high-way
    high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
    transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA.output_D_sent_level_rep) + high_b), 'transform_gate_DA')
    transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA.output_QA_sent_level_rep) + high_b), 'transform_gate_A')
    highW_para=[high_W, high_b]
        
    overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A=(1.0-transform_gate_DA)*layer1_DA.output_D_sent_level_rep+transform_gate_DA*layer3_DA.output_D_doc_level_rep
    overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A=(1.0-transform_gate_A)*layer1_DA.output_QA_sent_level_rep+transform_gate_A*layer2_A.output_sent_rep_Dlevel
    
    simi_sent_level=debug_print(cosine(layer1_DQ.output_D_sent_level_rep+layer1_DA.output_D_sent_level_rep, layer1_DQ.output_QA_sent_level_rep+layer1_DA.output_QA_sent_level_rep), 'simi_sent_level')
    simi_doc_level=debug_print(cosine(layer3_DQ.output_D_doc_level_rep+layer3_DA.output_D_doc_level_rep, layer2_Q.output_sent_rep_Dlevel+layer2_A.output_sent_rep_Dlevel), 'simi_doc_level')
    simi_overall_level=debug_print(cosine(overall_D_Q+overall_D_A, overall_Q+overall_A), 'simi_overall_level')
    

#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))
 
    

    
        

    layer4_input=debug_print(T.concatenate([simi_sent_level,
                                simi_doc_level,
                                simi_overall_level
                                ], axis=1), 'layer4_input')#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer4=LogisticRegression(rng, input=layer4_input, n_in=3, n_out=2)
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer4.W** 2).sum()+(high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer4.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    
# 
#     [train_data_D, train_data_Q, train_data_A, train_Y, train_Label, 
#                  train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A,
#                 train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A,
#                 train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A]=train_data
#     [test_data_D, test_data_Q, test_data_A, test_Y, test_Label, 
#                  test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A,
#                 test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A,
#                 test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A]=test_data  
#     index = T.lscalar()
#     index_D = T.lmatrix()   # now, x is the index matrix, must be integer
#     index_Q = T.lvector()
#     index_A= T.lvector()
#     
#     y = T.lvector()  
#     len_D=T.lscalar()
#     len_D_s=T.lvector()
#     len_Q=T.lscalar()
#     len_A=T.lscalar()
# 
#     left_D=T.lscalar()
#     left_D_s=T.lvector()
#     left_Q=T.lscalar()
#     left_A=T.lscalar()
# 
#     right_D=T.lscalar()
#     right_D_s=T.lvector()
#     right_Q=T.lscalar()
#     right_A=T.lscalar()
#         
# 
#     #wmf=T.dmatrix()
#     cost_tmp=T.dscalar()
    
    test_model = theano.function([index], [layer4.errors(y),layer4_input, y, layer4.prop_for_posi],
          givens={
            index_D: test_data_D[index], #a matrix
            index_Q: test_data_Q[index],
            index_A: test_data_A[index],
            y: test_Y[index:index+batch_size],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A: test_Length_A[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A: test_leftPad_A[index],
        
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A: test_rightPad_A[index]
            
            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer4.params+layer2_para+layer0_para+highW_para
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))    
 

  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index+batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer4.errors(y), layer4_input, y],
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A: train_data_A[index],
            y: train_Y[index:index+batch_size],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A: train_Length_A[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A: train_leftPad_A[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A: train_rightPad_A[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
#         readfile=open('/mounts/data/proj/wenpeng/Dataset/SICK/train_plus_dev.txt', 'r')
#         train_pairs=[]
#         train_y=[]
#         for line in readfile:
#             tokens=line.strip().split('\t')
#             listt=tokens[0]+'\t'+tokens[1]
#             train_pairs.append(listt)
#             train_y.append(tokens[2])
#         readfile.close()
#         writefile=open('/mounts/data/proj/wenpeng/Dataset/SICK/weights_fine_tune.txt', 'w')
        for batch_start in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % (batch_start*100.0/train_size) )
            sys.stdout.flush()
            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij

  
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)

            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                test_prop=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y, posi_prop=test_model(i)
                    test_prop.append(posi_prop[0][0])
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc=compute_test_acc(test_y, test_prop)
                #test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1
 
                #write_feature.close()
                clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33
                clf.fit(train_features, train_y)
                results=clf.decision_function(test_features)
                lr=linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr=lr.decision_function(test_features)
                 
                acc_svm=compute_test_acc(test_y, results)
                acc_lr=compute_test_acc(test_y, results_lr)
 
                find_better=False
                if acc_svm > max_acc:
                    max_acc=acc_svm
                    best_epoch=epoch
                    find_better=True
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch    
                    find_better=True             
                if acc_lr> max_acc:
                    max_acc=acc_lr
                    best_epoch=epoch
                    find_better=True
                print '\t\t\tsvm:', acc_svm, 'lr:', acc_lr, 'nn:', test_acc, 'max:',    max_acc,'(at',best_epoch,')'
#                 if find_better==True:
#                     store_model_to_file(layer2_para, best_epoch)
#                     print 'Finished storing best conv params'  

            if patience <= iter:
                done_looping = True
                break
        
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
   
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
# Example #36
# 0
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'):
    """Score the LenciBenotto test data with stored attentive-convolution parameters.

    Builds a Theano graph in which four ``Conv_for_Pair`` layers
    (``term_vs_term_layer``, ``term_vs_def_layer``, ``def_vs_term_layer``,
    ``def_vs_def_layer``) share one set of convolution parameters.  Their
    pooled outputs are combined with a cosine feature and ``extra`` and fed
    to a 2-class ``LogisticRegression`` layer.  The parameters are restored
    from a file under ``root_dic``, the test data is pushed through the
    model in batches of ``batch_size``, the per-example scores are reduced
    to one maximum per group (``group_size_list``) and handed to ``apk``
    together with ``test_labels``.  The two ``apk`` results are printed.

    No parameter update is performed in this function and nothing is
    returned.

    Args:
        learning_rate, n_epochs, L2_weight, extra_size: not read by any live
            statement in this function; they only appear in the printed
            ``model_options``.
        emb_size: width of one embedding row.
        batch_size: number of rows passed to every ``test_model`` call.  It
            is baked into the ``reshape`` calls and ``image_shape`` tuples,
            so every call must supply exactly this many rows.
        filter_size: only ``filter_size[0]`` is read (last entry of the
            convolution ``filter_shape``).
        maxSentLen: sequence length given to the loader and used when
            reshaping the sentence embeddings.
        hidden_size: only ``hidden_size[1]`` is read (first entry of the
            convolution ``filter_shape``).
        max_term_len: passed to the loader as ``wordlen`` and used as the
            last entry of ``image_shape`` for the term inputs.
        p_mode: ``'conc'`` concatenates the four pair feature vectors; any
            other value takes their element-wise maximum.
    """

    # Snapshot of the call arguments (locals() holds only the parameters here).
    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, control the model generates the same results


    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id  =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len)  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    # root_dic and LenciBenotto_file are not defined in this function; they
    # must be provided at module level.
    word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl')
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len)



    # Convert the loaded lists to numpy arrays: ids as int32, masks and
    # `extra` as Theano's configured float type.
    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)


    test_sents_r=np.asarray(test_sents_r, dtype='int32')


    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)


    test_word1=np.asarray(test_word1, dtype='int32')
    test_word2=np.asarray(test_word2, dtype='int32')


    test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX)


    # NOTE(review): test_labels_store is never read afterwards; the plain
    # `test_labels` list is what gets passed to apk() below.
    test_labels_store=np.asarray(test_labels, dtype='int32')

    test_extra=np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size=len(test_sents_l)
    print ' test size: ', test_size

    # One more row than there are vocabulary entries; row 0 is zeroed just
    # below (presumably the padding id -- TODO confirm against the loader).
    vocab_size=len(word2id)+1


    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable
    # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    # `labels` only feeds `loss`, which is not an output of test_model.
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    def embed_input(emb_matrix, sent_ids):
        # Look up one embedding row per id, reshape to
        # (batch_size, maxSentLen, emb_size) and swap the last two axes.
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    # Term embeddings are kept as (batch_size, n_ids, emb_size); the masked
    # sum over axis 1 gives one vector per term for the cosine feature.
    embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1)
    word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1)


    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    # NOTE(review): conv_b_context is passed to every Conv_for_Pair layer
    # but is not in this list, so load_model_from_file() below does not
    # restore it -- confirm this matches the layout of the stored file.
    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''
    # The four layers below receive the same W / b / W_context / b_context
    # objects; they differ only in which inputs and masks are paired.
    term_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = word1_mask,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    # Pair features: left, right, element-wise product, difference.
    p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1)

    term_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_input_r,
             mask_matrix = word1_mask,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1)


    def_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = sents_mask_l,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1)


    def_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1)

    # p_len assumes each attentive_maxpool_vec_* is hidden_size[1] wide
    # (four blocks per pair vector) -- TODO confirm in Conv_for_Pair.
    if p_mode == 'conc':
        # Concatenate the four pair vectors side by side.
        p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4*4*hidden_size[1]
    else:
        # Stack the four pair vectors on a new leading axis and keep the
        # element-wise maximum.
        p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0)
        p_len =4*hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)


    "form input to LR classifier"
    # Two extra columns are appended to p: the cosine feature and `extra`,
    # hence the +1+1 in LR_input_size.
    LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    LR_input_size=p_len+1+1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) #basically it is a multiplication between weight matrix and input feature vector
    # `loss` is built but never compiled into a function here.
    loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.


    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()





    # The order of this list is the order handed to load_model_from_file.
    params = NN_para+LR_para #[init_embeddings]


    # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)
    load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params)

    '''
    0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424
    0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538
    '''

    # Compiled forward pass returning layer_LR.y_pred and
    # layer_LR.prop_for_posi (presumably the positive-class probability --
    # TODO confirm in LogisticRegression).
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    # NOTE(review): despite the banner and the message below, no training
    # function is compiled or called; only test_model is run.  The
    # early-stopping / timing variables that follow are never read.
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    # Python 2 integer division: number of complete batches.
    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    if n_test_remain!=0:
        # The graph needs exactly batch_size rows per call, so the leftover
        # examples are covered by one extra full-size batch that starts at
        # test_size-batch_size and overlaps the previous batch.
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)



    # max_acc_dev=0.0
    max_ap_test=0.0
    max_ap_topk_test=0.0
    max_f1=0.0


    pred_labels =[]
    probs = []
    gold_labels =[]
    error_sum=0.0
    for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])

        # error_sum+=error_i
        pred_labels+=list(pred_i)
        probs+=list(prob_i)

    print len(test_sents_l), len(probs)
    if n_test_remain !=0:
        # Drop the overlap introduced by the extra batch: keep every score
        # from the complete batches, then only the last n_test_remain scores
        # of the final batch.
        # NOTE(review): pred_labels is not trimmed the same way; it is not
        # read again in this function.
        probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:]
    print len(test_sents_l), len(probs)
    assert len(test_sents_l) == len(probs)
    assert sum(group_size_list) == len(probs)
    #max prob in group
    # Walk through probs in consecutive chunks whose sizes come from
    # group_size_list and keep the largest score of each chunk.
    max_probs = []
    prior_size = 0
    for i in range(len(group_size_list)):

        sub_probs = probs[prior_size:prior_size+group_size_list[i]]
        prior_size += group_size_list[i]
        max_probs.append(max(sub_probs))

    print len(group_size_list),len(max_probs),len(test_labels)
    assert len(test_labels) == len(max_probs)
    # test_acc=1.0-error_sum/(len(test_batch_start))
    # apk is defined elsewhere; it is called once with k equal to the number
    # of labels and once with k=100.
    test_ap = apk(test_labels, max_probs, k=len(test_labels))
    test_ap_top100 = apk(test_labels, max_probs, k=100)


    # if test_ap > max_ap_test:
    #     max_ap_test=test_ap
    #     store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params)
    # if test_ap_top100 > max_ap_topk_test:
    #     max_ap_topk_test=test_ap_top100
    print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
Example #37
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=300,
                    char_emb_size=20,
                    hidden_size=300,
                    L2_weight=0.0001,
                    p_len_limit=400,
                    test_p_len_limit=100,
                    q_len_limit=20,
                    char_len=15,
                    filter_size=[5, 5],
                    char_filter_size=3,
                    margin=2.0,
                    max_EM=50.302743615):
    test_batch_size = batch_size * 10
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    word2id = {}
    char2id = {}
    #questions,paragraphs,q_masks,p_masks,labels, word2id
    train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id = load_squad_cnn_rank_word_train(
        word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size = len(train_para_list)

    test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_word_dev(
        word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size = len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask,
                                    dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask,
                                      dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask,
                                         dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask,
                                     dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask,
                                        dtype=theano.config.floatX)

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX, rng)
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX, rng)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices = T.imatrix()  #batch, (start, end) for each sample
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    char_paragraph = T.imatrix()  #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask = T.fmatrix()
    char_q_mask = T.fmatrix()

    true_p_len = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]

    common_input_p = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, true_p_len,
         emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    common_input_q = embeddings[questions.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size))

    char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape(
        (true_batch_size * true_p_len, char_len, char_emb_size
         ))  #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q = char_embeddings[char_questions.flatten()].reshape(
        (true_batch_size * q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape(
        (true_batch_size * true_p_len, char_len))
    char_q_masks = char_q_mask.reshape(
        (true_batch_size * q_len_limit, char_len))

    conv_W_char, conv_b_char = create_conv_para(
        rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1 = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + char_emb_size,
                      filter_size[0]))
    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size, 1,
                                                        hidden_size,
                                                        filter_size[1]))

    conv_W_1_q, conv_b_1_q = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + char_emb_size,
                      filter_size[0]))
    conv_W_2_q, conv_b_2_q = create_conv_para(rng,
                                              filter_shape=(hidden_size, 1,
                                                            hidden_size,
                                                            filter_size[1]))
    NN_para = [
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q,
        conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char
    ]

    input4score = squad_cnn_rank_word(
        rng, common_input_p, common_input_q, char_common_input_p,
        char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size,
        char_emb_size, char_len, filter_size, char_filter_size, hidden_size,
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q,
        conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask,
        char_p_masks, char_q_masks)  #(batch, 4*hidden, p_len_limit)

    test_input4score = squad_cnn_rank_word(
        rng, common_input_p, common_input_q, char_common_input_p,
        char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit,
        emb_size, char_emb_size, char_len, filter_size, char_filter_size,
        hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q,
        conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char,
        para_mask, q_mask, char_p_masks,
        char_q_masks)  #(batch, 4*hidden, p_len_limit)

    # gram_size = 5*true_p_len-(0+1+2+3+4)

    HL_1_para = create_ensemble_para(rng, hidden_size,
                                     6 * hidden_size + char_emb_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a = normalize_matrix(U_a)
    norm_HL_1_para = normalize_matrix(HL_1_para)
    norm_HL_2_para = normalize_matrix(HL_2_para)
    norm_HL_3_para = normalize_matrix(HL_3_para)
    norm_HL_4_para = normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size,
                                         6 * hidden_size + char_emb_size)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size)
    end_norm_U_a = normalize_matrix(end_U_a)
    end_norm_HL_1_para = normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para = normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para = normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para = normalize_matrix(end_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,
                                           norm_HL_2_para, norm_HL_3_para,
                                           norm_HL_4_para, norm_U_a,
                                           batch_size, true_p_len)
    span_scores = T.nnet.softmax(span_scores_matrix)  #(batch, para_len)
    end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,
                                               end_norm_HL_2_para,
                                               end_norm_HL_3_para,
                                               end_norm_HL_4_para,
                                               end_norm_U_a, batch_size,
                                               true_p_len)
    end_span_scores = T.nnet.softmax(
        end_span_scores_matrix)  #(batch, para_len)
    loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), gold_indices[:, 0]]))
    end_loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), gold_indices[:, 1]]))

    #ranking loss start
    tanh_span_scores_matrix = span_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    index_matrix = T.zeros((batch_size, p_len_limit),
                           dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(
        index_matrix[T.arange(batch_size), gold_indices[:, 0]], 1.0)
    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0 -
                                               new_index_matrix).nonzero()]
    repeat_posi = T.extra_ops.repeat(prob_batch_posi,
                                     prob_batch_nega.shape[0],
                                     axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0),
                                     prob_batch_posi.shape[0],
                                     axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega))

    #ranking loss END
    end_tanh_span_scores_matrix = end_span_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit),
                               dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(
        end_index_matrix[T.arange(batch_size), gold_indices[:, 1]], 1.0)
    end_prob_batch_posi = end_tanh_span_scores_matrix[
        end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_span_scores_matrix[(
        1.0 - end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi,
                                         end_prob_batch_nega.shape[0],
                                         axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle(
        'x', 0),
                                         end_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    end_loss_rank = T.mean(
        T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega))

    loss = loss_neg_likelihood + end_loss_neg_likelihood + loss_rank + end_loss_rank

    #test
    test_span_scores_matrix = add_HLs_2_tensor3(
        test_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para,
        norm_HL_4_para, norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    mask_test_return = T.argmax(test_span_scores_matrix * para_mask,
                                axis=1)  #batch

    end_test_span_scores_matrix = add_HLs_2_tensor3(
        test_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
        end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    end_mask_test_return = T.argmax(end_test_span_scores_matrix * para_mask,
                                    axis=1)  #batch

    params = (
        [embeddings, char_embeddings] + NN_para +
        [U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] +
        [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para])

    L2_reg = L2norm_paraList([
        embeddings, char_embeddings, conv_W_1, conv_W_2, conv_W_1_q,
        conv_W_2_q, conv_W_char, U_a, HL_1_para, HL_2_para, HL_3_para,
        HL_4_para
    ])
    #L2_reg = L2norm_paraList(params)
    cost = loss + L2_weight * L2_reg

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function(
        [
            paragraph,
            questions,
            gold_indices,
            para_mask,
            q_mask,
            char_paragraph,  #(batch, char_len*p_len)
            char_questions,
            char_para_mask,
            char_q_mask,
            true_p_len
        ],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function([
        paragraph, questions, para_mask, q_mask, char_paragraph,
        char_questions, char_para_mask, char_q_mask, true_p_len
    ], [mask_test_return, end_mask_test_return],
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / test_batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(numpy.arange(n_test_batches) *
                            test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.Random(4).shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_ids[para_id:para_id + batch_size]
            cost_i += train_model(
                train_para_list[train_id_batch], train_Q_list[train_id_batch],
                train_label_list[train_id_batch],
                train_para_mask[train_id_batch], train_Q_mask[train_id_batch],
                train_para_char_list[train_id_batch],
                train_Q_char_list[train_id_batch],
                train_para_char_mask[train_id_batch],
                train_Q_char_mask[train_id_batch], p_len_limit)

            #print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict = {}
                q_amount = 0
                p1 = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids, batch_predict_end_ids = test_model(
                        test_para_list[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_list[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_mask[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_mask[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_char_list[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id +
                                         test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id +
                                         test_batch_size], test_p_len_limit)
                    test_para_wordlist_batch = test_para_wordlist_list[
                        test_para_id:test_para_id + test_batch_size]
                    #                     test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
                    #                     q_amount+=test_batch_size
                    q_ids_batch = q_idlist[test_para_id:test_para_id +
                                           test_batch_size]
                    q_amount += test_batch_size

                    for q in range(test_batch_size):  #for each question
                        #                         pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])

                        start = batch_predict_ids[q]
                        end = batch_predict_end_ids[q]
                        if end < start:
                            start, end = end, start
                        pred_ans = ' '.join(
                            test_para_wordlist_batch[q][start:end + 1])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
                with codecs.open(rootPath + 'predictions.txt', 'w',
                                 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json',
                                                  rootPath + 'predictions.txt')
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc


#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=2000,
                    word_nkerns=500,
                    char_nkerns=100,
                    batch_size=1,
                    window_width=3,
                    emb_size=500,
                    char_emb_size=100,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0003,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40,
                    max_char_len=40,
                    max_des_len=20,
                    max_relation_len=5,
                    max_Q_len=30,
                    train_neg_size=6,
                    neg_all=100,
                    train_size=75893,
                    test_size=19168,
                    mark='_BiasedMaxPool_lr0.1_word500_char100_noDes_ent2.0'
                    ):  #train_size=75909, test_size=17386
    #     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = [
        'annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt',
        'annotated_fb_data_test.entitylinking.top20_succSet_asInput.fromMo_FB5M.txt'
    ]

    rng = numpy.random.RandomState(23455)
    word2id, char2id = load_word2id_char2id(mark)
    #     datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_test_or_valid(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size)#max_char_len, max_des_len, max_relation_len, max_Q_len

    datasets_test, length_per_example_test, word2id, char2id = load_test_or_valid(
        triple_files[1], char2id, word2id, max_char_len, max_des_len,
        max_relation_len, max_Q_len, test_size)
    vocab_size = len(word2id)
    char_size = len(char2id)
    print 'vocab_size:', vocab_size, 'char_size:', char_size

    #     train_data=datasets
    #     valid_data=datasets[1]
    test_data = datasets_test
    #     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
    #
    #     train_pos_entity_char=train_data[0]
    #     train_pos_entity_des=train_data[1]
    #     train_relations=train_data[2]
    #     train_entity_char_lengths=train_data[3]
    #     train_entity_des_lengths=train_data[4]
    #     train_relation_lengths=train_data[5]
    #     train_mention_char_ids=train_data[6]
    #     train_remainQ_word_ids=train_data[7]
    #     train_mention_char_lens=train_data[8]
    #     train_remainQ_word_len=train_data[9]
    #     train_entity_scores=train_data[10]

    test_pos_entity_char = test_data[0]
    #    test_pos_entity_des=test_data[1]
    test_relations = test_data[2]
    test_entity_char_lengths = test_data[3]
    #    test_entity_des_lengths=test_data[4]
    test_relation_lengths = test_data[5]
    test_mention_char_ids = test_data[6]
    test_remainQ_word_ids = test_data[7]
    test_mention_char_lens = test_data[8]
    test_remainQ_word_len = test_data[9]
    test_entity_scores = test_data[10]
    #
    #     test_pos_entity_char=test_data[0]       #matrix, each row for line example, all head and tail entity, iteratively: 40*2*51
    #     test_pos_entity_des=test_data[1]        #matrix, each row for a examle: 20*2*51
    #     test_relations=test_data[2]             #matrix, each row for a example: 5*51
    #     test_entity_char_lengths=test_data[3]   #matrix, each row for a example: 3*2*51  (three valies for one entity)
    #     test_entity_des_lengths=test_data[4]    #matrix, each row for a example: 3*2*51  (three values for one entity)
    #     test_relation_lengths=test_data[5]      #matrix, each row for a example: 3*51
    #     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
    #     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
    #     test_mention_char_lens=test_data[8]     #matrix, each three values for a mention: 3
    #     test_remainQ_word_len=test_data[9]      #matrix, each three values for a remain question: 3

    #     train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
    #            len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    #     if sum(train_sizes)/len(train_sizes)!=train_size:
    #         print 'weird size:', train_sizes
    #         exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_relations), len(test_entity_char_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes) / len(test_sizes) != test_size:
        print 'weird size:', test_sizes
        exit(0)


#     n_train_batches=train_size/batch_size
#     n_test_batches=test_size/batch_size

#     train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
#     test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

#     indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
#     indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
#     indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
#     indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
#     indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
#     indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
#     indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
#     indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
#     indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
#     indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)
#     indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores)

#     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
#     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
#     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
#     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
#     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
#     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
#     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
#     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
#     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
#     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)
#     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #     rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX,
                                           numpy.random.RandomState(1234))
    #     char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    index = T.iscalar()
    chosed_indices = T.ivector()

    ent_char_ids_M = T.imatrix()
    ent_lens_M = T.imatrix()
    men_char_ids_M = T.imatrix()
    men_lens_M = T.imatrix()
    rel_word_ids_M = T.imatrix()
    rel_word_lens_M = T.imatrix()
    #desH_word_ids_M=T.imatrix()
    #desH_word_lens_M=T.imatrix()
    q_word_ids_M = T.imatrix()
    q_word_lens_M = T.imatrix()
    ent_scores = T.fvector()

    filter_size = (emb_size, window_width)
    char_filter_size = (char_emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    char_filter_shape = (char_nkerns, 1, char_filter_size[0],
                         char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng,
                                                filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    #q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [
        char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W,
        q_rel_conv_b
    ]  #, q_desH_conv_W, q_desH_conv_b]
    load_model_from_file(rootPath, params, mark)

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f,
                               rel_word_lens_f, men_char_ids_f, q_word_ids_f,
                               men_lens_f, q_word_lens_f):

        #         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        #desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
            (batch_size, max_Q_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #ent_mention
        ent_char_conv = Conv_with_input_para(rng,
                                             input=ent_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng,
                                             input=men_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng,
                                          input=q_word_input,
                                          image_shape=(batch_size, 1, emb_size,
                                                       max_Q_len),
                                          filter_shape=word_filter_shape,
                                          W=q_rel_conv_W,
                                          b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng,
                                        input=rel_word_input,
                                        image_shape=(batch_size, 1, emb_size,
                                                     max_relation_len),
                                        filter_shape=word_filter_shape,
                                        W=q_rel_conv_W,
                                        b=q_rel_conv_b)
        #q_desH
        #q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
        #        image_shape=(batch_size, 1, emb_size, max_Q_len),
        #        filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        #desH_conv = Conv_with_input_para(rng, input=desH_word_input,
        #        image_shape=(batch_size, 1, emb_size, max_des_len),
        #        filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)

        ent_conv_pool = Max_Pooling(rng,
                                    input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0],
                                    right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng,
                                    input_l=men_char_conv.output,
                                    left_l=men_lens_f[0],
                                    right_l=men_lens_f[2])

        #q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool = Max_Pooling(rng,
                                    input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0],
                                    right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(
            rng,
            input_l=q_rel_conv.output,
            input_r=rel_conv_pool.output_maxpooling,
            left_l=q_word_lens_f[0],
            right_l=q_word_lens_f[2],
            length_l=q_word_lens_f[1] + filter_size[1] - 1,
            dim=max_Q_len + filter_size[1] - 1,
            topk=2)

        #q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        #desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])


        overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)*0.33333+\
                    cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)*0.55
        #           0.0*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling)
        #                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi

    simi_list, updates = theano.scan(SimpleQ_matches_Triple,
                                     sequences=[
                                         ent_char_ids_M, ent_lens_M,
                                         rel_word_ids_M, rel_word_lens_M,
                                         men_char_ids_M, q_word_ids_M,
                                         men_lens_M, q_word_lens_M
                                     ])

    simi_list += 0.2 * ent_scores

    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(
        0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.sum(loss_simi_list)

    test_model = theano.function([
        ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M,
        rel_word_lens_M, q_word_ids_M, q_word_lens_M, ent_scores
    ], [loss_simi, simi_list],
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... testing'

    start_time = time.clock()
    mid_time = start_time

    epoch = 0

    test_loss = []
    succ = 0
    for i in range(test_size):

        #prepare data
        test_ent_char_ids_M = numpy.asarray(test_pos_entity_char[i],
                                            dtype='int32').reshape(
                                                (length_per_example_test[i],
                                                 max_char_len))
        test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i],
                                        dtype='int32').reshape(
                                            (length_per_example_test[i], 3))
        test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i],
                                            dtype='int32').reshape(
                                                (length_per_example_test[i],
                                                 max_char_len))
        test_men_lens_M = numpy.asarray(test_mention_char_lens[i],
                                        dtype='int32').reshape(
                                            (length_per_example_test[i], 3))
        test_rel_word_ids_M = numpy.asarray(test_relations[i],
                                            dtype='int32').reshape(
                                                (length_per_example_test[i],
                                                 max_relation_len))
        test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i],
                                             dtype='int32').reshape(
                                                 (length_per_example_test[i],
                                                  3))
        #test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int32').reshape((length_per_example_test[i], max_des_len))
        #test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int32').reshape((length_per_example_test[i], 3))
        test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i],
                                          dtype='int32').reshape(
                                              (length_per_example_test[i],
                                               max_Q_len))
        test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i],
                                           dtype='int32').reshape(
                                               (length_per_example_test[i], 3))
        test_ent_scores = numpy.asarray(test_entity_scores[i],
                                        dtype=theano.config.floatX)

        loss_simi_i, simi_list_i = test_model(
            test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M,
            test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M,
            test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
        #                     print 'simi_list_i:', simi_list_i[:10]
        test_loss.append(loss_simi_i)
        if len(simi_list_i) == 1 or simi_list_i[0] >= max(simi_list_i[1:]):
            succ += 1
        if i % 1000 == 0:
            print 'testing', i, '...acc:', (succ * 1.0 /
                                            (i + 1)) * (19168 * 1.0 / 21687)
    succ = succ * 100.0 / 21687
    #now, check MAP and MRR
    print 'accu:', succ

    #     store_model_to_file(rootPath, params, succ, mark)

    print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
Example #39
0
def evaluate_lenet5(file_name,
                    input_filename,
                    model_filename,
                    learning_rate=0.001,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=128,
                    max_d_length=128,
                    margin=0.3):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    f = open(file_name, 'w')
    f.write("model options " + str(model_options) + '\n')
    #rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data, _train_Label, train_size, test_data, _test_Label, test_size, vocab_size = load_MCTest_corpus_DPN(
        'vocab_table_wenyan.txt', input_filename, input_filename, max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test
    f.write('train_size : ' + str(train_size))
    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
    #          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
    #         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
    #         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
    # return results, line_control
    [
        train_data_D, train_data_A1, train_Label, train_Length_D,
        train_Length_D_s, train_Length_A1, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_A1, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_A1
    ] = train_data
    [
        test_data_D, test_data_A1, test_Label, test_Length_D, test_Length_D_s,
        test_Length_A1, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_A1
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values, 'vectors_wenyan2.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    #   index_Q = T.lvector()
    index_A1 = T.lvector()
    #    index_A2= T.lvector()
    #     index_A3= T.lvector()
    #     index_A4= T.lvector()
    y = T.lscalar()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    #     len_Q=T.lscalar()
    len_A1 = T.lscalar()
    #    len_A2=T.lscalar()
    #     len_A3=T.lscalar()
    #     len_A4=T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    #     left_Q=T.lscalar()
    left_A1 = T.lscalar()
    #    left_A2=T.lscalar()
    #     left_A3=T.lscalar()
    #     left_A4=T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    #     right_Q=T.lscalar()
    right_A1 = T.lscalar()
    #    right_A2=T.lscalar()
    #     right_A3=T.lscalar()
    #     right_A4=T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    f.write('... building the model\n')

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (batch_size, maxSentLength,
         emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #layer0_A2_input = embeddings[index_A2.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_A3_input = embeddings[index_A3.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    #     layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(nkerns[0], 1,
                                                    filter_words[0],
                                                    filter_words[1]))
    layer0_para = [conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # this part decides nkern[0] and nkern[1] must be in the same dimension
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]
    #load_model(params)

    layer0_D = Conv_with_input_para(
        rng,
        input=layer0_D_input,
        image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    layer0_A1 = Conv_with_input_para(
        rng,
        input=layer0_A1_input,
        image_shape=(batch_size, 1, ishape[0], ishape[1]),
        filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]),
        W=conv_W,
        b=conv_b)
    #layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
    # image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #  filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output, 'layer0_A1.output')
    #layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output')
    #     layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output')
    #     layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output')

    #     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA1 = Average_Pooling_Scan(rng,
                                      input_D=layer0_D_output,
                                      input_r=layer0_A1_output,
                                      kern=nkerns[0],
                                      left_D=left_D,
                                      right_D=right_D,
                                      left_D_s=left_D_s,
                                      right_D_s=right_D_s,
                                      left_r=left_A1,
                                      right_r=right_A1,
                                      length_D_s=len_D_s + filter_words[1] - 1,
                                      length_r=len_A1 + filter_words[1] - 1,
                                      dim=maxSentLength + filter_words[1] - 1,
                                      doc_len=maxDocLength,
                                      topk=1)
    #layer1_DA2=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0],
    # left_D=left_D, right_D=right_D,
    # left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A2, right_r=right_A2,
    # length_D_s=len_D_s+filter_words[1]-1, length_r=len_A2+filter_words[1]-1,
    #  dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    #     layer1_DA3=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A3, right_r=right_A3,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A3+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    #     layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)

    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    #     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #  image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #  filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    #     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #   image_shape=(batch_size, 1, nkerns[0], 1),
    # filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    #     layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    #     layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
    #     layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')

    #     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=1)
    #layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1],
    #   left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #   length_l=len_D+filter_sents[1]-1, length_r=1,
    #    dim=maxDocLength+filter_sents[1]-1, topk=3)
    #     layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    #     layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)

    #high-way

    #     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')
    # transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2')
    #     transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3')
    #     transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4')

    #     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    # overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep
    #     overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep
    #     overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep

    #     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
    #overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel
    #     overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel
    #     overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    #simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
    #     simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
    #     simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    #simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
    #     simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
    #     simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')
    #simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2')
    #     simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3')
    #     simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4')

    #     simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1
    #     simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0
    #simi_1 = simi_doc_level1
    #simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
    #     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
    #     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0
    logistic_w, logistic_b = create_logistic_para(rng, 1, 2)
    logistic_para = [logistic_w, logistic_b]
    sent_w, sent_b = create_logistic_para(rng, 1, 2)
    doc_w, doc_b = create_logistic_para(rng, 1, 2)
    sent_para = [sent_w, sent_b]
    doc_para = [doc_w, doc_b]
    params += logistic_para
    params += sent_para
    params += doc_para

    load_model(params, model_filename)
    simi_sent = T.dot(sent_w, simi_sent_level1) + sent_b.dimshuffle(0, 'x')
    simi_sent = simi_sent.dimshuffle(1, 0)
    simi_sent = T.nnet.softmax(simi_sent)
    tmp_sent = T.log(simi_sent)

    simi_doc = T.dot(doc_w, simi_doc_level1) + doc_b.dimshuffle(0, 'x')
    simi_doc = simi_doc.dimshuffle(1, 0)
    simi_doc = T.nnet.softmax(simi_doc)
    tmp_doc = T.log(simi_doc)
    #cost = margin - simi_1
    simi_overall = T.dot(logistic_w,
                         simi_overall_level1) + logistic_b.dimshuffle(0, 'x')
    simi_overall = simi_overall.dimshuffle(1, 0)

    simi_overall = T.nnet.softmax(simi_overall)
    predict = T.argmax(simi_overall, axis=1)
    tmp_overall = T.log(simi_overall)
    cost = -(tmp_overall[0][y] + tmp_doc[0][y] + tmp_sent[0][y]) / 3.0
    L2_reg = (conv2_W**2).sum() + (conv_W**2).sum() + (logistic_w**2).sum() + (
        high_W**2).sum()
    cost = cost + L2_weight * L2_reg
    #simi_1 = [simi_overall,simi_doc,simi_sent]
    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #     #only use overall_simi
    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    #     posi_simi=simi_overall_level1
    #     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
    #     cost=T.maximum(0.0, margin+simi_2-simi_1)
    #cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)
    #     posi_simi=simi_1
    #     nega_simi=simi_2

    #L2_reg =debug_print((high_W**2).sum()+(conv2_W**2).sum()+(conv_W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    #cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [cost, simi_overall, simi_doc, simi_sent, predict],
        givens={
            index_D: test_data_D[index],  #a matrix
            #             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            y: test_Label[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            #             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            #            len_A2: test_Length_A2[index],
            #             len_A3: test_Length_A3[index],
            #             len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            #             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            #            left_A2: test_leftPad_A2[index],
            #             left_A3: test_leftPad_A3[index],
            #             left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            #             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
        },
        on_unused_input='ignore')

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))


#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))

    train_model = theano.function(
        [index],
        [cost, simi_overall, simi_doc, simi_sent, predict],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            #            index_A2: train_data_A2[index],
            #             index_A3: train_data_A3[index],
            #             index_A4: train_data_A4[index],
            y: train_Label[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            #            len_A2: train_Length_A2[index],
            #             len_A3: train_Length_A3[index],
            #             len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            #            left_A2: train_leftPad_A2[index],
            #             left_A3: train_leftPad_A3[index],
            #             left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            #            right_A2: train_rightPad_A2[index]
            #             right_A3: train_rightPad_A3[index],
            #             right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    f.write('... training\n')
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch
    cost, simi_overall, simi_doc, simi_sent, predict = test_model(0)
    cost, simi_overall1, simi_doc, simi_sent, predict = test_model(1)
    cost, simi_overall2, simi_doc, simi_sent, predict = test_model(2)
    cost, simi_overall3, simi_doc, simi_sent, predict = test_model(3)
    return simi_overall, simi_overall1, simi_overall2, simi_overall3
    '''