Example #1
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=200,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_concMT_BBN_NI_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_il_and_MT(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out so that features at padded (UNK) positions are zeroed
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out so that features at padded (UNK) positions are zeroed
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)
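    # top_k_score_matrix: for each (sentence, type) pair, the mean of the 30
    # largest word-to-word cosine values, passed through a sigmoid as a
    # fine-grained dataless score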

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    #classification layer: maps the feature vector of size acnn_LR_input_size to scores for the 12 types
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
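    # gather, for each example and each of its 4 "other" fields, the softmax
    # probability assigned to the gold class given in other_labels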
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
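
The objective above (reused in Example #2) treats the 12 types as independent binary decisions: every type gets a sigmoid score, and the loss is the mean negative log-probability assigned to the gold 0/1 label. A minimal NumPy sketch of that loss, outside the Theano graph and with made-up scores (the function name below is ours, not from the original code), might look like this:

import numpy as np

def multi_label_loss(score_matrix, labels):
    # score_matrix: (batch, type_size) sigmoid outputs in (0, 1)
    # labels:       (batch, type_size) binary gold indicators
    prob_pos = np.where(labels < 1, 1.0 - score_matrix, score_matrix)
    return -np.mean(np.log(prob_pos))

scores = np.array([[0.9, 0.2], [0.4, 0.7]])
gold = np.array([[1, 0], [0, 1]])
print multi_label_loss(scores, gold)  # small when high scores line up with gold types
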
Example #2
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3,5], maxSentLen=100, hidden_size=[300,300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    # fixed random seed so the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id=load_BBN_multi_labels_dataset(maxlen=maxSentLen)  # each example keeps at least one label and at least one word in the sentence
    train_sents=np.asarray(all_sentences[0], dtype='int32')
    train_masks=np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels=np.asarray(all_labels[0], dtype='int32')
    train_size=len(train_labels)

    dev_sents=np.asarray(all_sentences[1], dtype='int32')
    dev_masks=np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels=np.asarray(all_labels[1], dtype='int32')
    dev_size=len(dev_labels)

    test_sents=np.asarray(all_sentences[2], dtype='int32')
    test_masks=np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size=len(test_labels)

    comb_sents = np.concatenate([train_sents,test_sents], axis=0)
    comb_masks = np.concatenate([train_masks,test_masks], axis=0)
    comb_labels = np.asarray([0]*train_size+[1]*test_size, dtype='int32')
    comb_size = len(comb_labels)

    vocab_size=  len(word2id)+1 # add one zero pad index

    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #initialize the embedding matrix from a Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_fasttext_multiple_word2vec_given_file([emb_root+'wiki.en.vec',emb_root+'mono-lingual-il5-xinli.vec'], 300)
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    #now, start to build the input form of the model
    sents_id_matrix=T.imatrix('sents_id_matrix')
    sents_mask=T.fmatrix('sents_mask')
    labels=T.imatrix('labels')  #batch*12
    domain_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input=embeddings[sents_id_matrix.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input*sents_mask.dimshuffle(0,'x',1),axis=2)
    # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x')



    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
             mask_matrix = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #multiply the mask with conv_out so that features at padded (UNK) positions are zeroed
    sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
             mask_matrix = sents_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2)    #multiply the mask with conv_out so that features at padded (UNK) positions are zeroed
    sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    '''
    adversarial
    '''
    domain_input = T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1)
    domain_input_size = hidden_size[0]*2+emb_size
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(rng, domain_input_size, hidden_size[0])
    HL_layer_2_W, HL_layer_2_b = create_HiddenLayer_para(rng, hidden_size[0], hidden_size[0])
    adver_HL_para = [HL_layer_1_W, HL_layer_1_b, HL_layer_2_W, HL_layer_2_b]
    HL_layer_1=HiddenLayer(rng, input=domain_input, n_in=domain_input_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[0], W=HL_layer_2_W, b=HL_layer_2_b, activation=T.tanh)
    disc_para_W, disc_para_b = create_LR_para(rng,hidden_size[0],2)
    conf_para_W, conf_para_b = create_LR_para(rng,hidden_size[0],2)
    conf_para_W2, conf_para_b2 = create_LR_para(rng,hidden_size[0],2)
    adver_LR_para = [disc_para_W, disc_para_b, conf_para_W, conf_para_b,conf_para_W2, conf_para_b2]
    disc_layer_LR=LogisticRegression(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=2, W=disc_para_W, b=disc_para_b)
    conf_layer_LR=LogisticRegression(rng, input=HL_layer_2.output, n_in=hidden_size[0], n_out=2, W=conf_para_W, b=conf_para_b)
    conf2_layer_LR=LogisticRegression(rng, input=HL_layer_2.output, n_in=hidden_size[0], n_out=2, W=conf_para_W2, b=conf_para_b2)
    disc_loss = disc_layer_LR.negative_log_likelihood(domain_labels)
    conf_loss = conf_layer_LR.negative_log_likelihood_specific_label(0)
    conf_loss2 = conf2_layer_LR.negative_log_likelihood_specific_label(1)
    adver_loss = disc_loss+conf_loss+conf_loss2
    adver_para = adver_HL_para+adver_LR_para   +NN_para
    adver_updates =   Gradient_Cost_Para(adver_loss,adver_para, learning_rate)

    '''
    SF classification
    '''
    LR_input = HL_layer_2.output#T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1)
    LR_input_size = hidden_size[0]
    #classification layer: maps the feature vector of size LR_input_size to scores for the 12 types
    U_a = create_ensemble_para(rng, 12, LR_input_size) # weight matrix for the 12 target classes
    LR_b = theano.shared(value=np.zeros((12,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]
    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where( labels < 1, 1.0-score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))


    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = NN_para+adver_HL_para+LR_para   # put all model parameters together
    cost=loss+1e-4*((conv_W**2).sum()+(conv_W2**2).sum())
    updates =   Gradient_Cost_Para(cost,params, learning_rate)

    '''
    testing
    '''
    binarize_prob = T.where(score_matrix > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    comb_model = theano.function([sents_id_matrix, sents_mask, domain_labels], adver_loss, updates=adver_updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    n_comb_batches=comb_size/batch_size
    comb_batch_start=list(np.arange(n_comb_batches)*batch_size)+[comb_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]


    # max_acc_dev=0.0
    max_meanf1_test=0.0
    max_weightf1_test=0.0
    train_indices = range(train_size)
    comb_indices = range(comb_size)
    cost_i=0.0
    adver_cost_i=0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(comb_indices)
        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]

            cost_i+= train_model(
                                train_sents[train_id_batch],
                                train_masks[train_id_batch],
                                train_labels[train_id_batch])

            comb_id_batch = comb_indices[batch_id:batch_id+batch_size]
            adver_cost_i+= comb_model(
                                comb_sents[comb_id_batch],
                                comb_masks[comb_id_batch],
                                comb_labels[comb_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if  iter%20==0:
                print 'Epoch ', epoch, 'iter ', iter, ' average cost: ', cost_i/iter , adver_cost_i/iter , 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()

                error_sum=0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start: # for each test batch
                    pred_labels=test_model(
                                test_sents[test_batch_id:test_batch_id+batch_size],
                                test_masks[test_batch_id:test_batch_id+batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id+batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)


                test_mean_f1, test_weight_f1 =average_f1_two_array_by_col(all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test=test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test=test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test


        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
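
Example #2 evaluates with average_f1_two_array_by_col, a helper imported from elsewhere and not shown on this page; it returns a mean F1 over the label columns and a weighted variant. Purely as a hypothetical illustration of what a column-wise (per-label) mean F1 over binary prediction/gold matrices can look like, and not necessarily the actual implementation, one could write:

import numpy as np

def mean_f1_by_column(pred, gold):
    # pred, gold: (n_examples, n_labels) binary 0/1 arrays
    f1s = []
    for col in range(pred.shape[1]):
        tp = float(np.sum((pred[:, col] == 1) & (gold[:, col] == 1)))
        fp = float(np.sum((pred[:, col] == 1) & (gold[:, col] == 0)))
        fn = float(np.sum((pred[:, col] == 0) & (gold[:, col] == 1)))
        prec = tp / (tp + fp) if tp + fp > 0 else 0.0
        rec = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1s.append(2 * prec * rec / (prec + rec) if prec + rec > 0 else 0.0)
    return np.mean(f1s)  # unweighted mean over the label columns
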
Example #3
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=700,
                    il5_emb_size=300,
                    train_ratio=0.8,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 5],
                    max_il5_phrase_len=5,
                    hidden_size=300):
    model_options = locals().copy()
    print "model options", model_options
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model generates the same results
    # srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    # en_word2vec=load_fasttext_word2vec_given_file(root+'wiki.en.vec') #
    en_word2vec = load_fasttext_word2vec_given_file(
        '/save/wenpeng/datasets/word2vec_words_300d.txt', 300)
    il5_word2vec = load_fasttext_word2vec_given_file(
        root + 'il5_300d_word2vec.txt', 300)

    english_vocab = set(en_word2vec.keys())
    il5_vocab = set(il5_word2vec.keys())

    source_ids, source_masks, target_ids, il5_word2id, english_word2id = load_trainingdata_il5(
        root, english_vocab, il5_vocab, max_il5_phrase_len)
    # print len(english_vocab)
    # print len(english_word2id)
    # print set(english_word2id.keys()) - english_vocab
    assert set(english_word2id.keys()).issubset(english_vocab)
    assert set(il5_word2id.keys()).issubset(il5_vocab)

    data_size = len(target_ids)
    train_size = int(data_size * train_ratio)
    dev_size = data_size - train_size
    print 'training size: ', train_size, ' dev_size: ', dev_size

    # all_sentences, all_masks, all_labels, word2id=load_BBN_multi_labels_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_sents = np.asarray(source_ids[:train_size], dtype='int32')
    train_masks = np.asarray(source_masks[:train_size],
                             dtype=theano.config.floatX)
    train_target = np.asarray(target_ids[:train_size], dtype='int32')

    dev_sents = np.asarray(source_ids[-dev_size:], dtype='int32')
    dev_masks = np.asarray(source_masks[-dev_size:],
                           dtype=theano.config.floatX)
    dev_target = np.asarray(target_ids[-dev_size:], dtype='int32')

    en_vocab_size = len(english_word2id)
    en_rand_values = rng.normal(
        0.0, 0.01,
        (en_vocab_size, emb_size))  #initialize the embedding matrix from a Gaussian distribution
    en_id2word = {y: x for x, y in english_word2id.iteritems()}
    en_rand_values = load_word2vec_to_init(en_rand_values, en_id2word,
                                           en_word2vec)
    en_embeddings = theano.shared(
        value=np.array(en_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    il5_vocab_size = len(il5_word2id) + 1  # add one zero pad index
    il5_rand_values = rng.normal(
        0.0, 0.01, (il5_vocab_size,
                    il5_emb_size))  #initialize the embedding matrix from a Gaussian distribution
    il5_id2word = {y: x for x, y in il5_word2id.iteritems()}
    il5_rand_values = load_word2vec_to_init(il5_rand_values, il5_id2word,
                                            il5_word2vec)
    il5_embeddings = theano.shared(
        value=np.array(il5_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    # source_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable
    # target_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    batch_ids = T.imatrix()  #(batch, maxlen)
    batch_masks = T.fmatrix()  #(batch, maxlen)
    batch_targets = T.ivector()
    batch_test_source = T.fmatrix()  #(batch, emb_size)

    input_batch = il5_embeddings[batch_ids.flatten()].reshape(
        (batch_size, max_il5_phrase_len, il5_emb_size))  #(batch, max_il5_phrase_len, il5_emb_size)
    masked_input_batch = T.sum(input_batch * batch_masks.dimshuffle(0, 1, 'x'),
                               axis=1)  #(batch, emb_size)
    # masked_input_batch = masked_input_batch / T.sum(batch_masks,axis=1).dimshuffle(0,'x')

    target_embs = en_embeddings[batch_targets]

    HL_layer_W1, HL_layer_b1 = create_HiddenLayer_para(rng, hidden_size,
                                                       hidden_size)
    HL_layer_W2, HL_layer_b2 = create_HiddenLayer_para(rng, hidden_size,
                                                       hidden_size)
    HL_layer_W3, HL_layer_b3 = create_HiddenLayer_para(rng, hidden_size,
                                                       hidden_size)
    HL_layer_params = [
        HL_layer_W3, HL_layer_b3
    ]  #][HL_layer_W1, HL_layer_b1, HL_layer_W2, HL_layer_b2, HL_layer_W3, HL_layer_b3]
    #doc, by pos
    # HL_layer_1=HiddenLayer(rng, input=masked_input_batch, n_in=il5_emb_size, n_out=emb_size, W=HL_layer_W1, b=HL_layer_b1, activation=T.nnet.relu)
    # HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=emb_size, n_out=emb_size, W=HL_layer_W2, b=HL_layer_b2, activation=T.nnet.relu)
    HL_layer_3 = HiddenLayer(rng,
                             input=masked_input_batch,
                             n_in=emb_size,
                             n_out=emb_size,
                             W=HL_layer_W3,
                             b=HL_layer_b3,
                             activation=T.tanh)

    batch_raw_pred = HL_layer_3.output
    # batch_pred = batch_raw_pred/T.sqrt(1e-8+T.sum(batch_raw_pred**2, axis=1)).dimshuffle(0,'x')
    batch_cosine = T.mean(
        cosine_row_wise_twoMatrix(
            batch_raw_pred,
            target_embs))  #T.mean(T.sum(batch_pred *target_embs, axis=1))
    batch_distance = T.mean(
        T.sqrt(1e-8 + T.sum((batch_raw_pred - target_embs)**2, axis=1)))
    cos_loss = -T.log(1.0 + batch_cosine)  #1.0-batch_cosine
    loss = -T.log(1.0 / (1.0 + batch_distance))

    params = HL_layer_params  # put all model parameters together
    cost = cos_loss + loss  #+1e-4*((HL_layer_W3**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([batch_ids, batch_masks, batch_targets],
                                  loss,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([batch_ids, batch_masks, batch_targets],
                                 batch_cosine,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = dev_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [dev_size - batch_size]

    train_indices = range(train_size)
    cost_i = 0.0
    max_cosine = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_target[train_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                test_loss = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    test_loss_i = test_model(
                        dev_sents[test_batch_id:test_batch_id + batch_size],
                        dev_masks[test_batch_id:test_batch_id + batch_size],
                        dev_target[test_batch_id:test_batch_id + batch_size])

                    test_loss += test_loss_i
                test_loss /= len(test_batch_start)

                if test_loss > max_cosine:
                    max_cosine = test_loss
                print '\t\t\t\t\t\t\t\tcurrent mean_cosine:', test_loss, ' max cosine: ', max_cosine

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
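
Example #3 learns a tanh mapping from summed IL5 phrase embeddings to English target embeddings; its cost adds a cosine term and a Euclidean-distance term (cos_loss + loss above). A small NumPy sketch of that combined objective on toy vectors, outside the Theano graph and with the epsilon placement only approximated, could be:

import numpy as np

def mapping_loss(pred, target):
    # pred, target: (batch, emb_size); roughly mirrors cos_loss + loss above
    cos = np.mean(np.sum(pred * target, axis=1) /
                  (np.linalg.norm(pred, axis=1) * np.linalg.norm(target, axis=1) + 1e-8))
    dist = np.mean(np.sqrt(1e-8 + np.sum((pred - target) ** 2, axis=1)))
    return -np.log(1.0 + cos) - np.log(1.0 / (1.0 + dist))
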
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3,
                    maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200,
                    L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DQAAAA(rootPath+'vocab_DQAAAA.txt', rootPath+'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath+'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test


    [train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, 
                 train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4]=train_data
    [test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, 
                 test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4]=test_data                


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DQAAAA_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A1= T.lvector()
    index_A2= T.lvector()
    index_A3= T.lvector()
    index_A4= T.lvector()
#     y = T.lvector()  
    
    len_D=T.lscalar()
    len_D_s=T.lvector()
    len_Q=T.lscalar()
    len_A1=T.lscalar()
    len_A2=T.lscalar()
    len_A3=T.lscalar()
    len_A4=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
    left_Q=T.lscalar()
    left_A1=T.lscalar()
    left_A2=T.lscalar()
    left_A3=T.lscalar()
    left_A4=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
    right_Q=T.lscalar()
    right_A1=T.lscalar()
    right_A2=T.lscalar()
    right_A3=T.lscalar()
    right_A4=T.lscalar()
        


    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_Q_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_A1_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    
        
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b] 
#     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
#     layer2_para=[conv2_W, conv2_b]
#     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
#     highW_para=[high_W, high_b]

    #load_model(params)
    
    
    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D,:,:],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U,W=W,b=b)
    layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:,left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:,left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:,left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)

    
    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output=debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output')
    layer0_A1_output=debug_print(layer0_A1.output_vector_mean, 'layer0_A1.output')
    layer0_A2_output=debug_print(layer0_A2.output_vector_mean, 'layer0_A2.output')
    layer0_A3_output=debug_print(layer0_A3.output_vector_mean, 'layer0_A3.output')
    layer0_A4_output=debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output')
    
    #before reasoning, do a GRU for doc: d
    U_d, W_d, b_d=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d_para=[U_d, W_d, b_d]
    layer_D_GRU = GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d,W=W_d,b=b_d,bptt_truncate=-1)
    
    #Reasoning Layer 1
    repeat_Q=debug_print(T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0],1)), maxDocLength, axis=1)[:,:layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN=debug_print(T.concatenate([layer_D_GRU.output_matrix,repeat_Q], axis=0).transpose(), 'input_DNN')#each row is an example
    output_DNN1=HiddenLayer(rng, input=input_DNN, n_in=nkerns[0]*2, n_out=nkerns[0])
    output_DNN2=HiddenLayer(rng, input=output_DNN1.output, n_in=nkerns[0], n_out=nkerns[0])
    
    DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out')
    U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para=[U_p, W_p, b_p] 
    pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1)
    translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1')


    #before reasoning, do a GRU for doc: d2
    U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d2_para=[U_d2, W_d2, b_d2]
    layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1)
    #Reasoning Layer 2
    repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1')
    input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example
    output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0])
    output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0])
    
    DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2')
    U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para2=[U_p2, W_p2, b_p2] 
    pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1)
    translated_Q2=debug_print(pooling2.output_vector_max, 'translated_Q2')
    

    QA1=T.concatenate([translated_Q2, layer0_A1_output], axis=0)
    QA2=T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3=T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4=T.concatenate([translated_Q2, layer0_A4_output], axis=0)
    
    W_HL,b_HL=create_HiddenLayer_para(rng, n_in=nkerns[0]*2, n_out=1)
    match_params=[W_HL,b_HL]
    QA1_match=HiddenLayer(rng, input=QA1, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA2_match=HiddenLayer(rng, input=QA2, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA3_match=HiddenLayer(rng, input=QA3, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA4_match=HiddenLayer(rng, input=QA4, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    
    
    
#     simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1')
#     simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2')
#     simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3')
#     simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4')

    simi_overall_level1=debug_print(QA1_match.output[0], 'simi_overall_level1')
    simi_overall_level2=debug_print(QA2_match.output[0], 'simi_overall_level2')
    simi_overall_level3=debug_print(QA3_match.output[0], 'simi_overall_level3')
    simi_overall_level4=debug_print(QA4_match.output[0], 'simi_overall_level4')


#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))
 
    #only use overall_simi    
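    # ranking loss: the gold answer (A1) should score at least `margin` higher
    # than each of the three distractors (A2, A3, A4)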
    cost=T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level3-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level4-simi_overall_level1)
    
#     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    posi_simi=simi_overall_level1
    nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
#     #use ensembled simi
#     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
#     posi_simi=simi_1
#     nega_simi=T.max([simi_2, simi_3, simi_4])


    
    L2_reg =debug_print((U**2).sum()+(W**2).sum()
                        +(U_p**2).sum()+(W_p**2).sum()
                        +(U_p2**2).sum()+(W_p2**2).sum()
                        +(U_d**2).sum()+(W_d**2).sum()
                        +(U_d2**2).sum()+(W_d2**2).sum()
                        +(output_DNN1.W**2).sum()+(output_DNN2.W**2).sum()
                        +(output_DNN3.W**2).sum()+(output_DNN4.W**2).sum()
                        +(W_HL**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    


    
    test_model = theano.function([index], [cost, posi_simi, nega_simi],
          givens={
            index_D: test_data_D[index], #a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],

            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
        
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
            
            }, on_unused_input='ignore')


    params = layer0_para+output_DNN1.params+output_DNN2.params+output_DNN3.params+output_DNN4.params+layer_pooling_para+layer_pooling_para2+match_params+layer_d_para+layer_d2_para
    
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)


#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6)))   
#         updates.append((acc_i, acc))      
 
    def AdaDelta_updates(parameters,gradients,rho,eps):
        # create variables to store intermediate updates
        gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ]
        deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ]
     
        # accumulate the running average of squared gradients
        gradients_sq_new = [ rho*g_sq + (1-rho)*(g**2) for g_sq,g in zip(gradients_sq,gradients) ]
     
        # compute the update step: the ratio RMS[delta]/RMS[gradient] rescales each gradient
        deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ]
     
        # accumulate the running average of squared updates (deltas)
        deltas_sq_new = [ rho*d_sq + (1-rho)*(d**2) for d_sq,d in zip(deltas_sq,deltas) ]
     
        # package everything as lists of (shared_variable, new_value) update pairs
        gradient_sq_updates = zip(gradients_sq,gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq,deltas_sq_new)
        parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates   
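    # The helper above implements the AdaDelta recurrences (Zeiler, 2012):
    #   E[g^2]_t  = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
    #   delta_t   = sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
    #   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * delta_t^2
    #   param     = param - delta_t
    # Here rho is the `decay` hyperparameter and eps is 1e-6 (see the call below).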
    
    updates=AdaDelta_updates(params, grads, decay, 1e-6)
  
    train_model = theano.function([index], [cost, posi_simi, nega_simi], updates=updates,
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost, posi_simi, nega_simi],
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data


        corr_train=0
        for batch_start in train_batch_start: 
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) )
            sys.stdout.flush()

            minibatch_index=minibatch_index+1
            
            cost_average, posi_simi, nega_simi= train_model(batch_start)
            if posi_simi>nega_simi:
                corr_train+=1
            
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' corr rate: '+str(corr_train*100.0/train_size)

            
            if iter % validation_frequency == 0:
                corr_test=0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi=test_model(i)
                    if posi_simi>nega_simi:
                        corr_test+=1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc=corr_test*1.0/test_size
                #test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 

  
                find_better=False
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch    
                    find_better=True             
                print '\t\t\ttest_acc:', test_acc, 'max:',    max_acc,'(at',best_epoch,')'
                if find_better==True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'  

            if patience <= iter:
                done_looping = True
                break
        
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
   
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
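
A small, hypothetical Python sketch (not part of the original snippet) of the margin-based ranking objective computed above: the positive answer's score has to beat every negative answer's score by at least `margin`, otherwise a hinge penalty is added.

def margin_ranking_cost(posi_simi, nega_simis, margin=1.0):
    # illustrative only: one hinge term per negative answer, max(0, margin + nega - posi)
    return sum(max(0.0, margin + nega - posi_simi) for nega in nega_simis)

# toy scores: correct answer 0.9, distractors 0.2 / 0.5 / 0.8
print margin_ranking_cost(0.9, [0.2, 0.5, 0.8])  # 0.3 + 0.6 + 0.9 = 1.8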
Beispiel #5
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3, 1],
                    maxSentLen=70,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fixed random seed so the model generates reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen: each instance has one label and at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by sampling from a Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from the matrix "init_embeddings"; each sentence becomes a'
    'tensor2 of shape (emb_size, sen_length), and the minibatch becomes a tensor3 of shape (batch_size, emb_size, sen_length)'
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_pre, conv_b_pre = create_conv_para(rng,
                                              filter_shape=gate_filter_shape)
    conv_W_gate, conv_b_gate = create_conv_para(rng,
                                                filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate,
        conv_b_gate, conv_W2, conv_b2, conv_W2_context
    ]

    "A gated convolution layer to form more expressive word representations in each sentence"
    "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)"
    conv_layer_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_l,
        mask_matrix=sents_mask_l,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    conv_layer_gate_r = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_r,
        mask_matrix=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
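    # note: the left and right sentences share the same gate-convolution parameters (a siamese setup)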
    '''
    attentive convolution function; two filter widths (3 and 1) are used as separate channels (multi-channel)
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r
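    # the two attentive-convolution channels differ only in filter width (filter_size[0]=3 vs filter_size[1]=1),
    # yielding four attentive max-pooled sentence vectors in total (left/right for each width)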

    "Batch normalization for the four output sentence representation vectors"
    gamma = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[0]),
        high=1.0 / math.sqrt(hidden_size[0]),
        size=(hidden_size[0])),
                                     dtype=theano.config.floatX),
                          borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]),
                                  dtype=theano.config.floatX),
                         borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r.std((0, ), keepdims=True),
        mode='low_mem')

    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0, ), keepdims=True),
        mode='low_mem')
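    # batch_normalization computes gamma * (x - mean) / std + beta, with mean/std taken over the batch
    # dimension; the same gamma/beta pair is reused for all four sentence vectors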

    "Before logistic regression layer, we insert a hidden layer. Now form input to HL classifier"
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ],
                                     axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]
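    # 8 pieces of size hidden_size[0]: [l, r, l + r, l * r] from the width-3 channel plus the same four from the width-1 channel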
    "Create hidden layer parameters"
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    "Hidden Layer and batch norm to its output again"
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[1]),
        high=1.0 / math.sqrt(hidden_size[1]),
        size=(hidden_size[1])),
                                        dtype=theano.config.floatX),
                             borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]),
                                     dtype=theano.config.floatX),
                            borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(inputs=HL_layer_1.output,
                                       gamma=gamma_HL,
                                       beta=beta_HL,
                                       mean=HL_layer_1.output.mean(
                                           (0, ), keepdims=True),
                                       std=HL_layer_1.output.std(
                                           (0, ), keepdims=True),
                                       mode='low_mem')
    "Form input to LR classifier"
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]
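    # the classifier input combines the raw concatenated features with the normalized hidden-layer output (a shortcut-style connection)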
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    "Logistic Regression layer"
    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #essentially a multiplication between the weight matrix and the input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification tasks we use the negative log-likelihood as the loss; the lower the better.

    params = [
        init_embeddings
    ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
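    # each *_batch_start list holds the start indices of full-size batches; the final extra entry (size - batch_size)
    # covers any leftover tail examples with one more full batch that overlaps the previous one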

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set at each new epoch; this usually helps performance, though it is not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2
                                                     and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    '''
                    best dev model, test
                    '''
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))

                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
Beispiel #6
0
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=1.0,
                    decay=0.95):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DQAAAA(
        rootPath + 'vocab_DQAAAA.txt',
        rootPath + 'mc500.train.tsv_standardlized.txt_DQAAAA.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test

    [
        train_data_D, train_data_Q, train_data_A1, train_data_A2,
        train_data_A3, train_data_A4, train_Label, train_Length_D,
        train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2,
        train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
        train_leftPad_A4, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_Q, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3, train_rightPad_A4
    ] = train_data
    [
        test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3,
        test_data_A4, test_Label, test_Length_D, test_Length_D_s,
        test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3,
        test_Length_A4, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q,
        test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1,
        test_rightPad_A2, test_rightPad_A3, test_rightPad_A4
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(
        rand_values, rootPath + 'vocab_DQAAAA_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    #     y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up word embeddings for the document, question and answers and reshape
    # them into the (length, emb_size) layouts expected by the Bi-GRU layers below
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1),
                                 'layer0_D_input')  #.dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape(
        (maxSentLength, emb_size)).transpose(),
                                 'layer0_Q_input')  #.dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape(
        (maxSentLength,
         emb_size)).transpose(), 'layer0_A1_input')  #.dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()  #.dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()  #.dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()  #.dimshuffle(0, 'x', 1, 2)

    U, W, b, Ub, Wb, bb = create_Bi_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b, Ub, Wb, bb]
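    # a single set of Bi-GRU parameters (forward: U, W, b; backward: Ub, Wb, bb) is shared by the document,
    # question and all four answer encoders below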
    #     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    #     layer2_para=[conv2_W, conv2_b]
    #     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
    #     highW_para=[high_W, high_b]

    #load_model(params)

    layer0_D = Bi_GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :],
                                    lefts=left_D_s[left_D:-right_D],
                                    rights=right_D_s[left_D:-right_D],
                                    hidden_dim=nkerns[0],
                                    U=U,
                                    W=W,
                                    b=b,
                                    Ub=Ub,
                                    Wb=Wb,
                                    bb=bb)
    layer0_Q = Bi_GRU_Matrix_Input(X=layer0_Q_input[:, left_Q:-right_Q],
                                   word_dim=emb_size,
                                   hidden_dim=nkerns[0],
                                   U=U,
                                   W=W,
                                   b=b,
                                   U_b=Ub,
                                   W_b=Wb,
                                   b_b=bb,
                                   bptt_truncate=-1)
    layer0_A1 = Bi_GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U,
                                    W=W,
                                    b=b,
                                    U_b=Ub,
                                    W_b=Wb,
                                    b_b=bb,
                                    bptt_truncate=-1)
    layer0_A2 = Bi_GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U,
                                    W=W,
                                    b=b,
                                    U_b=Ub,
                                    W_b=Wb,
                                    b_b=bb,
                                    bptt_truncate=-1)
    layer0_A3 = Bi_GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U,
                                    W=W,
                                    b=b,
                                    U_b=Ub,
                                    W_b=Wb,
                                    b_b=bb,
                                    bptt_truncate=-1)
    layer0_A4 = Bi_GRU_Matrix_Input(X=layer0_A4_input[:, left_A4:-right_A4],
                                    word_dim=emb_size,
                                    hidden_dim=nkerns[0],
                                    U=U,
                                    W=W,
                                    b=b,
                                    U_b=Ub,
                                    W_b=Wb,
                                    b_b=bb,
                                    bptt_truncate=-1)

    layer0_D_output = debug_print(layer0_D.output,
                                  'layer0_D.output')  # hidden*2
    layer0_Q_output = debug_print(layer0_Q.output_vector_last,
                                  'layer0_Q.output')  # hidden*4
    layer0_A1_output = debug_print(layer0_A1.output_vector_last,
                                   'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output_vector_last,
                                   'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output_vector_last,
                                   'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output_vector_last,
                                   'layer0_A4.output')

    #before reasoning, do a GRU for doc: d
    U_d, W_d, b_d, U_db, W_db, b_db = create_Bi_GRU_para(
        rng, nkerns[0] * 2, nkerns[0] * 2)
    layer_d_para = [U_d, W_d, b_d, U_db, W_db, b_db]
    layer_D_GRU = Bi_GRU_Matrix_Input(X=layer0_D_output,
                                      word_dim=nkerns[0] * 2,
                                      hidden_dim=nkerns[0] * 2,
                                      U=U_d,
                                      W=W_d,
                                      b=b_d,
                                      U_b=U_db,
                                      W_b=W_db,
                                      b_b=b_db,
                                      bptt_truncate=-1)

    #Reasoning Layer 1
    repeat_Q = debug_print(
        T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0], 1)),
                 maxDocLength,
                 axis=1)[:, :layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN = debug_print(
        T.concatenate([layer_D_GRU.output_matrix, repeat_Q],
                      axis=0).transpose(),
        'input_DNN')  #each row is an example
    output_DNN1 = HiddenLayer(rng,
                              input=input_DNN,
                              n_in=nkerns[0] * 8,
                              n_out=nkerns[0])
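    # input_DNN stacks, for every document sentence, [Bi-GRU sentence state; question vector] as one row;
    # output_DNN1 maps each row to nkerns[0] features that are scored by the attention weights below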

    attention_W = create_ensemble_para(rng, nkerns[0], 1)
    attention_weights = T.nnet.softmax(
        T.dot(attention_W, output_DNN1.output.transpose()))
    repeat_attentions = T.repeat(attention_weights,
                                 layer_D_GRU.output_matrix.shape[0],
                                 axis=0)
    doc_r = T.sum(layer_D_GRU.output_matrix * repeat_attentions, axis=1)
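    # attention pooling: softmax weights over the document's sentence positions, then a weighted sum of the
    # Bi-GRU sentence states gives a single document vector doc_r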

    combine_DQ = T.concatenate([doc_r, layer0_Q_output],
                               axis=0)  # dim: hidden*8 (matches output_DNN2's n_in = nkerns[0] * 8)

    output_DNN2 = HiddenLayer(rng,
                              input=combine_DQ,
                              n_in=nkerns[0] * 8,
                              n_out=nkerns[0] * 4)

    #     DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out')
    #     U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0])
    #     layer_pooling_para=[U_p, W_p, b_p]
    #     pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1)
    #     translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1')
    #
    #
    #     #before reasoning, do a GRU for doc: d2
    #     U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0])
    #     layer_d2_para=[U_d2, W_d2, b_d2]
    #     layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1)
    #     #Reasoning Layer 2
    #     repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1')
    #     input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example
    #     output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0])
    #     output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0])
    #
    #     DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2')
    #     U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0])
    #     layer_pooling_para2=[U_p2, W_p2, b_p2]
    #     pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1)
    translated_Q2 = debug_print(output_DNN2.output, 'translated_Q2')

    QA1 = T.concatenate([translated_Q2, layer0_A1_output],
                        axis=0)  #dim: hidden*8 (matches the match-layer n_in = nkerns[0] * 8)
    QA2 = T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3 = T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4 = T.concatenate([translated_Q2, layer0_A4_output], axis=0)

    W_HL, b_HL = create_HiddenLayer_para(rng, n_in=nkerns[0] * 8, n_out=1)
    match_params = [W_HL, b_HL]
    QA1_match = HiddenLayer(rng,
                            input=QA1,
                            n_in=nkerns[0] * 8,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)
    QA2_match = HiddenLayer(rng,
                            input=QA2,
                            n_in=nkerns[0] * 8,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)
    QA3_match = HiddenLayer(rng,
                            input=QA3,
                            n_in=nkerns[0] * 8,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)
    QA4_match = HiddenLayer(rng,
                            input=QA4,
                            n_in=nkerns[0] * 8,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)

    #     simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1')
    #     simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2')
    #     simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3')
    #     simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4')

    simi_overall_level1 = debug_print(QA1_match.output[0],
                                      'simi_overall_level1')
    simi_overall_level2 = debug_print(QA2_match.output[0],
                                      'simi_overall_level2')
    simi_overall_level3 = debug_print(QA3_match.output[0],
                                      'simi_overall_level3')
    simi_overall_level4 = debug_print(QA4_match.output[0],
                                      'simi_overall_level4')

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #only use overall_simi
    cost = T.maximum(
        0.0, margin + simi_overall_level2 - simi_overall_level1) + T.maximum(
            0.0,
            margin + simi_overall_level3 - simi_overall_level1) + T.maximum(
                0.0, margin + simi_overall_level4 - simi_overall_level1)

    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin + nega - posi)
    posi_simi = simi_overall_level1
    nega_simi = T.max(
        [simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #     #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin + nega - posi)
    #     posi_simi=simi_1
    #     nega_simi=T.max([simi_2, simi_3, simi_4])

    L2_reg = debug_print(
        (U**2).sum() + (W**2).sum() + (Ub**2).sum() + (Wb**2).sum() +
        (output_DNN1.W**2).sum() + (output_DNN2.W**2).sum() + (U_d**2).sum() +
        (W_d**2).sum() + (U_db**2).sum() + (W_db**2).sum() + (W_HL**2).sum() +
        (attention_W**2).sum(), 'L2_reg'
    )  #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    params = layer0_para + output_DNN1.params + output_DNN2.params + match_params + layer_d_para + [
        attention_W
    ]

    #     accumulator=[]
    #     for para_i in params:
    #         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
    #         accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    #     updates = []
    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         grad_i=debug_print(grad_i,'grad_i')
    #         acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop
    #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6)))
    #         updates.append((acc_i, acc))

    def AdaDelta_updates(parameters, gradients, rho, eps):
        # create variables to store intermediate updates
        gradients_sq = [
            theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters
        ]
        deltas_sq = [
            theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters
        ]

        # accumulate the running average of squared gradients
        gradients_sq_new = [
            rho * g_sq + (1 - rho) * (g**2)
            for g_sq, g in zip(gradients_sq, gradients)
        ]

        # compute the update step: the ratio RMS[delta]/RMS[gradient] rescales each gradient
        deltas = [
            (T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad
            for d_sq, g_sq, grad in zip(deltas_sq, gradients_sq_new, gradients)
        ]

        # accumulate the running average of squared updates (deltas)
        deltas_sq_new = [
            rho * d_sq + (1 - rho) * (d**2)
            for d_sq, d in zip(deltas_sq, deltas)
        ]

        # package everything as lists of (shared_variable, new_value) update pairs
        gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
        parameters_updates = [(p, p - d) for p, d in zip(parameters, deltas)]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates

    updates = AdaDelta_updates(params, grads, decay, 1e-6)

    train_model = theano.function(
        [index], [cost, posi_simi, nega_simi],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #         shuffle(train_batch_start)#shuffle training data

        corr_train = 0
        for batch_start in train_batch_start:
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()

            minibatch_index = minibatch_index + 1

            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' corr rate: ' + str(
                            corr_train * 100.0 / train_size)

            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = corr_test * 1.0 / test_size
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    L2_weight=0.000001,
                    drop_p=0.05,
                    emb_size=300,
                    hidden_size=500,
                    HL_hidden_size=500,
                    batch_size=5,
                    filter_size=[3, 5, 7],
                    maxSentLen=180,
                    comment=''):

    model_options = locals().copy()
    print "model options", model_options

    rng = np.random.RandomState(
        1234)  #fixed random seed so the model generates reproducible results
    srng = RandomStreams(rng.randint(999999))
    all_sentences, all_masks, all_labels, word2id = load_yelp_dataset(
        maxlen=maxSentLen, minlen=2
    )  #minlen: each instance has one label and at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize embeddings from a Gaussian distribution
    # next, load pre-trained word vectors to initialize the known words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec_file('glove.840B.300d.txt')  #
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array "rand_values" into a Theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.ivector('labels')
    train_flag = T.iscalar()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  # this input format can be fed into a CNN, GRU, or LSTM
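    # common_input has shape (batch_size, emb_size, maxSentLen): each column is
    # the embedding of one token, so a convolution over the last axis slides
    # over word positions.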

    #     drop_common_input = dropout_layer(srng, common_input, drop_p, train_flag)

    bow = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1),
                axis=2)  #(batch, emb_size)
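    # bow is a simple masked bag-of-words feature: summing the embeddings of
    # the non-padding positions gives one emb_size vector per sentence; it is
    # concatenated with the CNN features further below.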

    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_2_pre, conv_b_2_pre = create_conv_para(
        rng, filter_shape=gate_filter_shape)
    conv_W_2_gate, conv_b_2_gate = create_conv_para(
        rng, filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size, 1, emb_size,
                                                    filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size, 1, emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    #     conv_W3, conv_b3=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size[2]))
    #     conv_W3_context, conv_b3_context=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 1))
    # conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
    soft_att_W_big, soft_att_b_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W_small, _ = create_HiddenLayer_para(rng, emb_size, 1)
    soft_att_W2_big, soft_att_b2_big = create_HiddenLayer_para(
        rng, emb_size * 2, emb_size)
    soft_att_W2_small, _ = create_HiddenLayer_para(rng, emb_size, 1)

    #     soft_att_W3_big, soft_att_b3_big = create_HiddenLayer_para(rng, emb_size*2, emb_size)
    #     soft_att_W3_small, _ = create_HiddenLayer_para(rng, emb_size, 1)

    NN_para = [
        conv_W_2_pre,
        conv_b_2_pre,
        conv_W_2_gate,
        conv_b_2_gate,
        conv_W,
        conv_b,
        conv_W_context,
        conv_W2,
        conv_b2,
        conv_W2_context,
        #              conv_W3, conv_b3,conv_W3_context,
        soft_att_W_big,
        soft_att_b_big,
        soft_att_W_small,
        soft_att_W2_big,
        soft_att_b2_big,
        soft_att_W2_small
        #              soft_att_W3_big, soft_att_b3_big,soft_att_W3_small
    ]  #,conv_W3, conv_b3,conv_W3_context]

    conv_layer_1_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_2_pre,
        b=conv_b_2_pre,
        W_gate=conv_W_2_gate,
        b_gate=conv_b_2_gate)

    advanced_sent_tensor3 = conv_layer_1_gate_l.output_tensor3
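    # The width-1 gated convolution above (presumably an elementwise gate of the
    # form conv_output * sigmoid(gate_output) inside Conv_with_Mask_with_Gate)
    # keeps the (batch_size, emb_size, maxSentLen) shape, producing gated word
    # representations that both attention-CNN layers below share as input.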

    #     conv_layer_pair = Conv_for_Pair(rng,
    #             origin_input_tensor3=advanced_sent_tensor3,
    #             origin_input_tensor3_r = advanced_sent_tensor3,
    #             input_tensor3=advanced_sent_tensor3,
    #             input_tensor3_r = advanced_sent_tensor3,
    #              mask_matrix = sents_mask,
    #              mask_matrix_r = sents_mask,
    #              image_shape=(batch_size, 1, emb_size, maxSentLen),
    #              image_shape_r = (batch_size, 1, emb_size, maxSentLen),
    #              filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
    #              filter_shape_context=(hidden_size, 1, emb_size, 1),
    #              W=conv_W, b=conv_b,
    #              W_context=conv_W_context, b_context=conv_b_context)

    conv_layer_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context,
        soft_att_W_big=soft_att_W_big,
        soft_att_b_big=soft_att_b_big,
        soft_att_W_small=soft_att_W_small)

    #     conv_layer_2_pair = Conv_for_Pair(rng,
    #             origin_input_tensor3=advanced_sent_tensor3,
    #             origin_input_tensor3_r = advanced_sent_tensor3,
    #             input_tensor3=advanced_sent_tensor3,
    #             input_tensor3_r = advanced_sent_tensor3,
    #              mask_matrix = sents_mask,
    #              mask_matrix_r = sents_mask,
    #              image_shape=(batch_size, 1, emb_size, maxSentLen),
    #              image_shape_r = (batch_size, 1, emb_size, maxSentLen),
    #              filter_shape=(hidden_size, 1, emb_size, filter_size[1]),
    #              filter_shape_context=(hidden_size, 1, emb_size, 1),
    #              W=conv_W2, b=conv_b2,
    #              W_context=conv_W2_context, b_context=conv_b2_context)
    conv_layer_2_pair = Conv_for_Pair_SoftAttend(
        rng,
        origin_input_tensor3=advanced_sent_tensor3,
        origin_input_tensor3_r=advanced_sent_tensor3,
        input_tensor3=advanced_sent_tensor3,
        input_tensor3_r=advanced_sent_tensor3,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        filter_shape=(hidden_size, 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context,
        soft_att_W_big=soft_att_W2_big,
        soft_att_b_big=soft_att_b2_big,
        soft_att_W_small=soft_att_W2_small)

    #     conv_layer_3_pair = Conv_for_Pair_SoftAttend(rng,
    #                                                origin_input_tensor3=advanced_sent_tensor3,
    #                                                origin_input_tensor3_r=advanced_sent_tensor3,
    #                                                input_tensor3=advanced_sent_tensor3,
    #                                                input_tensor3_r=advanced_sent_tensor3,
    #                                                mask_matrix=sents_mask,
    #                                                mask_matrix_r=sents_mask,
    #                                                filter_shape=(hidden_size, 1, emb_size, filter_size[2]),
    #                                                filter_shape_context=(hidden_size, 1, emb_size, 1),
    #                                                image_shape=(batch_size, 1, emb_size, maxSentLen),
    #                                                image_shape_r= (batch_size, 1, emb_size, maxSentLen),
    #                                                W=conv_W3, b=conv_b3,
    #                                                W_context=conv_W3_context, b_context=conv_b3_context,
    #                                                soft_att_W_big=soft_att_W3_big, soft_att_b_big=soft_att_b3_big,
    #                                                soft_att_W_small=soft_att_W3_small)

    # biased_sent_embeddings = conv_layer_pair.biased_attentive_maxpool_vec_l
    sent_embeddings = conv_layer_pair.maxpool_vec_l
    att_sent_embeddings = conv_layer_pair.attentive_maxpool_vec_l

    sent_embeddings_2 = conv_layer_2_pair.maxpool_vec_l
    att_sent_embeddings_2 = conv_layer_2_pair.attentive_maxpool_vec_l

    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    HL_input = T.concatenate(
        [
            bow, sent_embeddings, att_sent_embeddings, sent_embeddings_2,
            att_sent_embeddings_2
            #                               sent_embeddings_3,att_sent_embeddings_3,
        ],
        axis=1)
    HL_input_size = hidden_size * 4 + emb_size
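    # HL_input concatenates five sentence representations per example: the
    # emb_size bag-of-words vector plus four hidden_size CNN vectors (plain
    # max-pooling and attentive max-pooling for each of the two filter widths),
    # hence HL_input_size = hidden_size * 4 + emb_size.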

    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_input_size, HL_hidden_size)
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_input,
                             n_in=HL_input_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.nnet.relu)
    #     HL_layer_1_output = dropout_layer(srng, HL_layer_1.output, drop_p, train_flag)

    HL_layer_2_W, HL_layer_2_b = create_HiddenLayer_para(
        rng, HL_hidden_size, HL_hidden_size)
    HL_layer_2_params = [HL_layer_2_W, HL_layer_2_b]
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=HL_hidden_size,
                             n_out=HL_hidden_size,
                             W=HL_layer_2_W,
                             b=HL_layer_2_b,
                             activation=T.nnet.relu)
    #     HL_layer_2_output = dropout_layer(srng, HL_layer_2.output, drop_p, train_flag)

    LR_input = T.concatenate([HL_input, HL_layer_1.output, HL_layer_2.output],
                             axis=1)
    #     drop_LR_input = dropout_layer(srng, LR_input, drop_p, train_flag)
    LR_input_size = HL_input_size + 2 * HL_hidden_size
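    # Skip-connection style classifier input: the logistic regression sees the
    # raw HL_input features as well as the outputs of both hidden layers,
    # hence LR_input_size = HL_input_size + 2 * HL_hidden_size.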

    U_a = create_ensemble_para(
        rng, 5, LR_input_size)  # LR weight matrix: LR_input_size features -> 5 classes
    #     norm_W_a = normalize_matrix(U_a)
    LR_b = theano.shared(value=np.zeros((5, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=5, W=U_a, b=LR_b
    )  # essentially a linear map from the feature vector to 5 class scores
    loss = layer_LR.negative_log_likelihood(
        labels
    )  # classification loss: negative log-likelihood, the lower the better
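    # i.e. the loss is (presumably) the mean over the minibatch of
    # -log p(y_i | x_i), with p(. | x_i) the softmax output of the LR layer.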

    params = [
        embeddings
    ] + NN_para + HL_layer_1_params + HL_layer_2_params + LR_para  # put all model parameters together
    L2_reg = L2norm_paraList([
        embeddings, conv_W_2_pre, conv_W_2_gate, conv_W, conv_W_context,
        conv_W2, conv_W2_context, soft_att_W_big, soft_att_W_small,
        soft_att_W2_big, soft_att_W2_small, HL_layer_1_W, HL_layer_2_W, U_a
    ])
    #     diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix)

    cost = loss  #+L2_weight*L2_reg

    grads = T.grad(
        cost, params)  # create a list of gradients for all model parameters

    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True),
                              dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i, param_i - learning_rate * grad_i /
             (T.sqrt(acc) + 1e-8)))  # 1e-8 is added to avoid division by zero
        updates.append((acc_i, acc))
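    # The loop above implements AdaGrad: for each parameter theta with gradient g,
    #     acc   <- acc + g^2
    #     theta <- theta - learning_rate * g / (sqrt(acc) + 1e-8)
    # so frequently-updated parameters receive progressively smaller steps.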

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, labels, train_flag],
        [layer_LR.errors(labels), layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
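    # Three compiled Theano functions: train_model applies the AdaGrad updates,
    # while dev_model / test_model only report the error rate (test_model also
    # returns the predicted labels). train_flag is accepted but unused here
    # because the dropout layers are commented out; on_unused_input='ignore'
    # keeps compilation from failing on that input.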

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
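    # Integer division drops any remainder, so an extra start index
    # (size - batch_size) is appended: the final batch overlaps the previous
    # one but guarantees every example is seen while keeping a fixed
    # batch_size (the symbolic graph assumes exactly batch_size rows).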

    max_acc_dev = 0.0
    max_acc_test = 0.0
    cost_i = 0.0
    train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1
        # combined = zip(train_sents, train_masks, train_labels)
        random.Random(200).shuffle(
            train_indices
        )  # shuffle the training set each epoch; usually helps, but not guaranteed
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], 1)

            # every 2000 batches, evaluate the model on the full test set
            if iter % 2000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                # writefile=open('log.'+nn+'.senti.preditions.txt', 'w')
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i, pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size],
                        0)
                    # pred_labels=list(pred_labels)
                    # if test_batch_id !=test_batch_start[-1]:
                    #     writefile.write('\n'.join(map(str,pred_labels))+'\n')
                    # else:
                    #     writefile.write('\n'.join(map(str,pred_labels[-test_size%batch_size:])))

                    error_sum += error_i
                # writefile.close()
                test_accuracy = 1.0 - error_sum / (len(test_batch_start))
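                # test accuracy is the average of per-batch error rates; since
                # the last batch overlaps the previous one, this is a close
                # approximation rather than an exact per-example count.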
                if test_accuracy > max_acc_test:
                    max_acc_test = test_accuracy
                print '\t\tcurrent test acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
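

# A minimal, hypothetical driver (not part of the original example): run the
# Yelp experiment with the default hyper-parameters defined above and report
# the best test accuracy observed during training.
if __name__ == '__main__':
    best_test_acc = evaluate_lenet5()
    print 'best test accuracy:', best_test_acc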