Example #1
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=200,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_concMT_BBN_NI_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_il_and_MT(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #this (batch, emb_size, len) layout can feed a CNN, GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)
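    # repeat_common_input repeats every sentence type_size times and repeat_des_input tiles
    # the type descriptions batch_size times, so row k of both tensors refers to the same
    # (sentence, type) pair; this alignment is what lets the batched_dot below compute a
    # word-by-word similarity matrix for every pair.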

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features produced by padding (UNK) are zeroed
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  # multiply the mask with conv_out so that features produced by padding (UNK) are zeroed
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  # U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
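    # note: both attentive-convolution layers receive the same sentence as the left and the
    # right input (identical tensors and masks), so attention is computed between a sentence
    # and itself rather than between two different texts.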
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)
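    # each row of fine_grained_cosine_to_matrix holds every word-by-word similarity for one
    # (sentence, type-description) pair; averaging the 30 largest values gives a soft
    # "best-matching words" score per pair, which the sigmoid squashes into (0, 1).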

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
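    # width of the concatenation above: 5 sentence vectors of hidden_size[0] (two plain CNNs,
    # the GRU and two attentive CNNs) + bow_emb of emb_size + 4 score matrices with type_size
    # columns each (two dataless DNN scores, the cosine scores and the top-30 scores), which
    # is exactly the acnn_LR_input_size computed below.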
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    #classification layer: maps the concatenated feature vector onto one score per SF type (12 classes)
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))
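    # acnn_prob_pos holds, for every (sentence, type) cell, the probability assigned to the
    # gold decision (the score for positive labels, 1 - score for negative ones), so acnn_loss
    # is a mean binary cross-entropy over the 12 types.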

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))
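    # the "other field" head: its 16 outputs are reshaped into 4 independent 4-way softmaxes
    # per sentence, acnn_other_prob picks the probability of the gold option for each field,
    # and acnn_other_field_loss is the corresponding negative log-likelihood.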

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
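    # two optimisation paths: train_p1_model (defined below) updates the shared parameters
    # with the 12-type loss only, while train_p2_model additionally trains the "other field"
    # classifier on the batches that carry those extra annotations.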
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
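    # a type is predicted whenever its ensemble score exceeds 0.3; keeping the threshold
    # below 0.5 lets more candidate types through per sentence than plain rounding would.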
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #2
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=10,
                    filter_size=[3, 5],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #each kept example has at least one label and at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #this (batch, emb_size, len) layout can feed a CNN, GRU or LSTM

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features produced by padding (UNK) are zeroed
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  # multiply the mask with conv_out so that features produced by padding (UNK) are zeroed
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2], axis=1)
    LR_input_size = hidden_size[0] * 2
    #classification layer: maps the concatenated CNN features onto one score per label (12 classes)
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # weight matrix for the hidden_size*2 input features
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))

    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [embeddings
              ] + NN_para + LR_para  # put all model parameters together
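    # note that the embedding matrix itself is included in params, so the word vectors are
    # fine-tuned during training together with the CNN filters and the classifier weights.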
    cost = loss  #+Div_reg*diversify_reg#+L2_weight*L2_reg
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''
    binarize_prob = T.where(score_matrix > 0.5, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask],
                                 binarize_prob,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0
        cost_i = 0.0
        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]

                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_meanf1_test, max_weightf1_test
Example #3
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    drop_p=0.0,
                    div_weight=0.00001,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    comment='only one HL layer, drop0.0'):

    model_options = locals().copy()
    print "model options", model_options

    first_seeds = [1234, 1235, 1236, 1237]  # one seed per classifier in the first copy
    first_rngs = [
        np.random.RandomState(first_seeds[0]),
        np.random.RandomState(first_seeds[1]),
        np.random.RandomState(first_seeds[2]),
        np.random.RandomState(first_seeds[3])
    ]  # fixed random seeds so the model produces reproducible results
    first_srng = RandomStreams(first_rngs[0].randint(999999))

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_extra, all_labels, word2id, test_rows = load_SNLI_dataset_with_extra_with_test(
        maxlen=maxSentLen
    )  #each kept example has at least one label and at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    #     train_sents_l = np.concatenate((train_sents_l, dev_sents_l), axis=0)
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    #     train_masks_l = np.concatenate((train_masks_l, dev_masks_l), axis=0)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    #     train_sents_r = np.concatenate((train_sents_r, dev_sents_r), axis=0)
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    #     train_masks_r = np.concatenate((train_masks_r, dev_masks_r), axis=0)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_extra = np.asarray(all_extra[0], dtype=theano.config.floatX)
    dev_extra = np.asarray(all_extra[1], dtype=theano.config.floatX)
    test_extra = np.asarray(all_extra[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    #     train_labels_store = np.concatenate((train_labels_store, dev_labels_store), axis=0)
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = first_rngs[0].normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    #     word2vec =extend_word2vec_lowercase(word2vec)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    first_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    train_flag = T.iscalar()
    first_sents_ids_l = T.imatrix()
    first_sents_mask_l = T.fmatrix()
    first_sents_ids_r = T.imatrix()
    first_sents_mask_r = T.fmatrix()
    first_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def common_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    first_common_input_l = dropout_layer(
        first_srng, common_input(first_embeddings,
                                 first_sents_ids_l), drop_p, train_flag
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    first_common_input_r = dropout_layer(
        first_srng, common_input(first_embeddings,
                                 first_sents_ids_r), drop_p, train_flag
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
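    # dropout is applied to the word embeddings of both sentences; train_flag (passed as 1 by
    # the training function and 0 by the test function below) presumably lets dropout_layer
    # switch itself off at test time, and drop_p defaults to 0.0 in this configuration.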

    gate_filter_shape = (hidden_size[0], 1, emb_size, 1)

    def create_CNN_params(rng):
        conv_W_2_pre, conv_b_2_pre = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2_gate, conv_b_2_gate = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2, conv_b_2 = create_conv_para(rng,
                                              filter_shape=(hidden_size[1], 1,
                                                            hidden_size[0],
                                                            filter_size[0]))
        conv_W_2_context, conv_b_2_context = create_conv_para(
            rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
        return conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate, conv_W_2, conv_b_2, conv_W_2_context, conv_b_2_context

    first_conv_W_pre, first_conv_b_pre, first_conv_W_gate, first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context, first_conv_b_context = create_CNN_params(
        first_rngs[0])
    first_2_conv_W_pre, first_2_conv_b_pre, first_2_conv_W_gate, first_2_conv_b_gate, first_2_conv_W, first_2_conv_b, first_2_conv_W_context, first_2_conv_b_context = create_CNN_params(
        first_rngs[1])
    first_3_conv_W_pre, first_3_conv_b_pre, first_3_conv_W_gate, first_3_conv_b_gate, first_3_conv_W, first_3_conv_b, first_3_conv_W_context, first_3_conv_b_context = create_CNN_params(
        first_rngs[2])
    first_4_conv_W_pre, first_4_conv_b_pre, first_4_conv_W_gate, first_4_conv_b_gate, first_4_conv_W, first_4_conv_b, first_4_conv_W_context, first_4_conv_b_context = create_CNN_params(
        first_rngs[3])
    # first_5_conv_W_pre, first_5_conv_b_pre,first_5_conv_W_gate, first_5_conv_b_gate,first_5_conv_W, first_5_conv_b,first_5_conv_W_context, first_5_conv_b_context = create_CNN_params(first_rng)
    # first_6_conv_W_pre, first_6_conv_b_pre,first_6_conv_W_gate, first_6_conv_b_gate,first_6_conv_W, first_6_conv_b,first_6_conv_W_context, first_6_conv_b_context = create_CNN_params(first_rng)

    first_CNN_1_para = [
        first_conv_W_pre, first_conv_b_pre, first_conv_W_gate,
        first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context
    ]
    first_CNN_2_para = [
        first_2_conv_W_pre, first_2_conv_b_pre, first_2_conv_W_gate,
        first_2_conv_b_gate, first_2_conv_W, first_2_conv_b,
        first_2_conv_W_context
    ]
    first_CNN_3_para = [
        first_3_conv_W_pre, first_3_conv_b_pre, first_3_conv_W_gate,
        first_3_conv_b_gate, first_3_conv_W, first_3_conv_b,
        first_3_conv_W_context
    ]
    first_CNN_4_para = [
        first_4_conv_W_pre, first_4_conv_b_pre, first_4_conv_W_gate,
        first_4_conv_b_gate, first_4_conv_W, first_4_conv_b,
        first_4_conv_W_context
    ]
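    # four independent parameter sets (a "pre" conv, a gate conv, the main conv and a context
    # conv each), one per ensemble member, each created from its own RandomState so the four
    # classifiers start from different initialisations.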
    '''
    first copy
    '''

    def copy(
        rngs,
        common_input_l,
        common_input_r,
        sents_mask_l,
        sents_mask_r,
        drop_conv_W_1_pre,
        conv_b_1_pre,
        drop_conv_W_1_gate,
        conv_b_1_gate,
        drop_conv_W_1,
        conv_b_1,
        drop_conv_W_1_context,
        conv_b_1_context,
        drop_conv_W_1_pre_2,
        conv_b_1_pre_2,
        drop_conv_W_1_gate_2,
        conv_b_1_gate_2,
        drop_conv_W_1_2,
        conv_b_1_2,
        drop_conv_W_1_context_2,
        conv_b_1_context_2,
        drop_conv_W_1_pre_3,
        conv_b_1_pre_3,
        drop_conv_W_1_gate_3,
        conv_b_1_gate_3,
        drop_conv_W_1_3,
        conv_b_1_3,
        drop_conv_W_1_context_3,
        conv_b_1_context_3,
        drop_conv_W_1_pre_4,
        conv_b_1_pre_4,
        drop_conv_W_1_gate_4,
        conv_b_1_gate_4,
        drop_conv_W_1_4,
        conv_b_1_4,
        drop_conv_W_1_context_4,
        conv_b_1_context_4,

        #    drop_conv_W_1_pre_5,conv_b_1_pre_5,drop_conv_W_1_gate_5,conv_b_1_gate_5,
        #    drop_conv_W_1_5,conv_b_1_5,drop_conv_W_1_context_5, conv_b_1_context_5,
        #
        #    drop_conv_W_1_pre_6,conv_b_1_pre_6,drop_conv_W_1_gate_6,conv_b_1_gate_6,
        #    drop_conv_W_1_6,conv_b_1_6,drop_conv_W_1_context_6, conv_b_1_context_6,
        labels):

        loss_0_0, distr_0_0, params_0_0 = one_classifier_in_one_copy(
            rngs[0], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre, conv_b_1_pre, drop_conv_W_1_gate, conv_b_1_gate,
            drop_conv_W_1, conv_b_1, drop_conv_W_1_context, conv_b_1_context,
            True)
        loss_0_1, distr_0_1, params_0_1 = one_classifier_in_one_copy(
            rngs[1], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre_2, conv_b_1_pre_2, drop_conv_W_1_gate_2,
            conv_b_1_gate_2, drop_conv_W_1_2, conv_b_1_2,
            drop_conv_W_1_context_2, conv_b_1_context_2, False)
        loss_0_2, distr_0_2, params_0_2 = one_classifier_in_one_copy(
            rngs[2], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre_3, conv_b_1_pre_3, drop_conv_W_1_gate_3,
            conv_b_1_gate_3, drop_conv_W_1_3, conv_b_1_3,
            drop_conv_W_1_context_3, conv_b_1_context_3, True)
        loss_0_3, distr_0_3, params_0_3 = one_classifier_in_one_copy(
            rngs[3], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre_4, conv_b_1_pre_4, drop_conv_W_1_gate_4,
            conv_b_1_gate_4, drop_conv_W_1_4, conv_b_1_4,
            drop_conv_W_1_context_4, conv_b_1_context_4, False)
        # loss_0_4, distr_0_4, params_0_4 = one_classifier_in_one_copy(rng, common_input_l,common_input_r,sents_mask_l,sents_mask_r,batch_size, emb_size,
        #                                                              maxSentLen,gate_filter_shape,hidden_size,filter_size,
        #                        first_srng, drop_p,train_flag,labels,
        #                        drop_conv_W_1_pre_5,conv_b_1_pre_5,drop_conv_W_1_gate_5,conv_b_1_gate_5,
        #                        drop_conv_W_1_5,conv_b_1_5,drop_conv_W_1_context_5,conv_b_1_context_5,
        #                        True)
        # loss_0_5, distr_0_5, params_0_5 = one_classifier_in_one_copy(rng, common_input_l,common_input_r,sents_mask_l,sents_mask_r,batch_size, emb_size,
        #                                                              maxSentLen,gate_filter_shape,hidden_size,filter_size,
        #                        first_srng, drop_p,train_flag,labels,
        #                        drop_conv_W_1_pre_6,conv_b_1_pre_6,drop_conv_W_1_gate_6,conv_b_1_gate_6,
        #                        drop_conv_W_1_6,conv_b_1_6,drop_conv_W_1_context_6,conv_b_1_context_6,
        #                        False)

        #         psp_label = T.repeat(labels, multi_psp_size)

        loss_0 = [(loss_0_0 + loss_0_1 + loss_0_2 + loss_0_3) / 4.0, loss_0_0,
                  loss_0_1, loss_0_2, loss_0_3]  #5
        para_0 = [params_0_0, params_0_1, params_0_2, params_0_3]

        #         loss = loss_0+loss_1+loss_2
        batch_distr = distr_0_0 + distr_0_1 + distr_0_2 + distr_0_3  #T.sum((layer_LR.p_y_given_x).reshape((batch_size, multi_psp_size,3)), axis=1)  #(batch, 3)

        return loss_0, para_0, batch_distr
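    # copy() wires up four classifiers (via one_classifier_in_one_copy) on top of the shared
    # sentence representations and returns the averaged loss together with the four individual
    # losses, the four classifiers' parameter lists, and the summed class distribution that is
    # used for prediction.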

    first_loss, first_classifier_params, first_test_distr = copy(
        first_rngs,
        first_common_input_l,
        first_common_input_r,
        first_sents_mask_l,
        first_sents_mask_r,
        first_conv_W_pre,
        first_conv_b_pre,
        first_conv_W_gate,
        first_conv_b_gate,
        first_conv_W,
        first_conv_b,
        first_conv_W_context,
        first_conv_b_context,
        first_2_conv_W_pre,
        first_2_conv_b_pre,
        first_2_conv_W_gate,
        first_2_conv_b_gate,
        first_2_conv_W,
        first_2_conv_b,
        first_2_conv_W_context,
        first_2_conv_b_context,
        first_3_conv_W_pre,
        first_3_conv_b_pre,
        first_3_conv_W_gate,
        first_3_conv_b_gate,
        first_3_conv_W,
        first_3_conv_b,
        first_3_conv_W_context,
        first_3_conv_b_context,
        first_4_conv_W_pre,
        first_4_conv_b_pre,
        first_4_conv_W_gate,
        first_4_conv_b_gate,
        first_4_conv_W,
        first_4_conv_b,
        first_4_conv_W_context,
        first_4_conv_b_context,

        # drop_first_5_conv_W_pre,first_5_conv_b_pre,drop_first_5_conv_W_gate,first_5_conv_b_gate,
        # drop_first_5_conv_W,first_5_conv_b,drop_first_5_conv_W_context,first_5_conv_b_context,
        #
        # drop_first_6_conv_W_pre,first_6_conv_b_pre,drop_first_6_conv_W_gate,first_6_conv_b_gate,
        # drop_first_6_conv_W,first_6_conv_b,drop_first_6_conv_W_context,first_6_conv_b_context,
        first_labels)

    first_preds = T.argmax(first_test_distr, axis=1)  #batch
    all_error = T.mean(T.neq(first_preds, first_labels))

    #     neg_labels = T.where( labels < 2, 2, labels-1)
    #     loss2=-T.mean(T.log(1.0/(1.0+layer_LR.p_y_given_x))[T.arange(neg_labels.shape[0]), neg_labels])

    # rank loss
    # entail_prob_batch = T.nnet.softmax(layer_LR.before_softmax.T)[2] #batch
    # entail_ids = elementwise_is_two(labels)
    # entail_probs = entail_prob_batch[entail_ids.nonzero()]
    # non_entail_probs = entail_prob_batch[(1-entail_ids).nonzero()]
    #
    # repeat_entail = T.extra_ops.repeat(entail_probs, non_entail_probs.shape[0], axis=0)
    # repeat_non_entail = T.extra_ops.repeat(non_entail_probs.dimshuffle('x',0), entail_probs.shape[0], axis=0).flatten()
    # loss2 = -T.mean(T.log(entail_probs))#T.mean(T.maximum(0.0, margin-repeat_entail+repeat_non_entail))

    # zero_matrix = T.zeros((batch_size, 3))
    # filled_zero_matrix = T.set_subtensor(zero_matrix[T.arange(batch_size), labels], 1.0)
    # prob_batch_posi = layer_LR.p_y_given_x[filled_zero_matrix.nonzero()]
    # prob_batch_nega = layer_LR.p_y_given_x[(1-filled_zero_matrix).nonzero()]
    #
    # repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    # repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    # loss2 = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    first_common_para = [first_embeddings]
    first_classifier_1_para = first_CNN_1_para + first_classifier_params[0]
    first_classifier_2_para = first_CNN_2_para + first_classifier_params[1]
    first_classifier_3_para = first_CNN_3_para + first_classifier_params[2]
    first_classifier_4_para = first_CNN_4_para + first_classifier_params[3]
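    # update scheme: the shared embedding matrix is trained with the averaged loss
    # (first_loss[0]), while each classifier's own CNN and output parameters are trained
    # only with that classifier's individual loss.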

    first_common_updates = Gradient_Cost_Para(first_loss[0], first_common_para,
                                              learning_rate)
    first_classifier_1_updates = Gradient_Cost_Para(first_loss[1],
                                                    first_classifier_1_para,
                                                    learning_rate)
    first_classifier_2_updates = Gradient_Cost_Para(first_loss[2],
                                                    first_classifier_2_para,
                                                    learning_rate)
    first_classifier_3_updates = Gradient_Cost_Para(first_loss[3],
                                                    first_classifier_3_para,
                                                    learning_rate)
    first_classifier_4_updates = Gradient_Cost_Para(first_loss[4],
                                                    first_classifier_4_para,
                                                    learning_rate)

    cost = first_loss[0]

    first_updates = first_common_updates + first_classifier_1_updates + first_classifier_2_updates + first_classifier_3_updates + first_classifier_4_updates

    updates = first_updates

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([
        train_flag,
        first_sents_ids_l,
        first_sents_mask_l,
        first_sents_ids_r,
        first_sents_mask_r,
        first_labels,
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    #     train_model_pred = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra,labels,
    #                                         second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], [LR_input, labels], allow_input_downcast=True, on_unused_input='ignore')
    #
    #     dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra, labels,
    #                                  second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        train_flag,
        first_sents_ids_l,
        first_sents_mask_l,
        first_sents_ids_r,
        first_sents_mask_r,
        first_labels,
    ], [all_error, first_preds],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    gold_test_rows = test_rows[:(n_test_batches *
                                 batch_size)] + test_rows[-batch_size:]

    max_acc_test = 0.0

    cost_i = 0.0
    first_train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(200).shuffle(
            first_train_indices
        )  #shuffle training set for each new epoch; this is supposed to help performance, but is not guaranteed
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            first_train_id_batch = first_train_indices[batch_id:batch_id +
                                                       batch_size]
            cost_i += train_model(1, train_sents_l[first_train_id_batch],
                                  train_masks_l[first_train_id_batch],
                                  train_sents_r[first_train_id_batch],
                                  train_masks_r[first_train_id_batch],
                                  train_labels_store[first_train_id_batch])

            #we test the performance of the model on all test data every int(2000 * 50.0 / batch_size) batches (every 2000 batches with the default batch_size of 50)
            if iter % int(2000 * (50.0 / batch_size)) == 0:
                #             if iter%int(200*(50.0 / batch_size))==0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
                #     print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                #     past_time = time.time()

                #                 error_sum=0.0
                #                 for dev_batch_id in dev_batch_start: # for each test batch
                #                     error_i=dev_model(
                #                                 dev_sents_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_sents_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                #                                 )
                #
                #                     error_sum+=error_i
                #                 dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                #                 if dev_accuracy > max_acc_dev:
                #                     max_acc_dev=dev_accuracy
                #                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                #best dev model, do test
                #                 error_sum_1=0.0
                #                 error_sum_2=0.0
                error_sum_comb = 0.0
                pred_ys = []
                for test_batch_id in test_batch_start:  # for each test batch
                    error_comb, pred_ys_batch = test_model(
                        0,
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    #                     error_sum_1+=error_1
                    #                     error_sum_2+=error_2
                    error_sum_comb += error_comb
                    pred_ys += list(pred_ys_batch)

#                 test_acc_1=1.0-error_sum_1/(len(test_batch_start))
#                 test_acc_2=1.0-error_sum_2/(len(test_batch_start))
                test_acc_comb = 1.0 - error_sum_comb / (len(test_batch_start))

                #                 if test_acc_1 > max_acc_test:
                #                     max_acc_test=test_acc_1
                #                 if test_acc_2 > max_acc_test:
                #                     max_acc_test=test_acc_2
                if test_acc_comb > max_acc_test:
                    max_acc_test = test_acc_comb
                    #                     store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_single_model_'+str(max_acc_test), params)

                    if len(pred_ys) != len(gold_test_rows):
                        print 'len(pred_ys)!=len(gold_test_rows):', len(
                            pred_ys), len(gold_test_rows)
                    else:
                        test_write = open(
                            '/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/error_analysis_'
                            + str(max_acc_test) + '.txt', 'w')
                        for i in range(len(pred_ys)):
                            test_write.write(
                                str(pred_ys[i]) + '\t' + gold_test_rows[i] +
                                '\n')
                        print 'error analysis file written over.'
                        test_write.close()

                print '\t\tcurrent acc:', test_acc_comb, '\t\t\t\t\tmax_acc:', max_acc_test


#                 else:
#                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
Example #4
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=700,
                    il5_emb_size=300,
                    train_ratio=0.8,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 5],
                    max_il5_phrase_len=5,
                    hidden_size=300):
    model_options = locals().copy()
    print "model options", model_options
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # random seed so the model produces reproducible results
    # srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    # en_word2vec=load_fasttext_word2vec_given_file(root+'wiki.en.vec') #
    en_word2vec = load_fasttext_word2vec_given_file(
        '/save/wenpeng/datasets/word2vec_words_300d.txt', 300)
    il5_word2vec = load_fasttext_word2vec_given_file(
        root + 'il5_300d_word2vec.txt', 300)

    english_vocab = set(en_word2vec.keys())
    il5_vocab = set(il5_word2vec.keys())

    source_ids, source_masks, target_ids, il5_word2id, english_word2id = load_trainingdata_il5(
        root, english_vocab, il5_vocab, max_il5_phrase_len)
    # print len(english_vocab)
    # print len(english_word2id)
    # print set(english_word2id.keys()) - english_vocab
    assert set(english_word2id.keys()).issubset(english_vocab)
    assert set(il5_word2id.keys()).issubset(il5_vocab)

    data_size = len(target_ids)
    train_size = int(data_size * train_ratio)
    dev_size = data_size - train_size
    print 'train size: ', train_size, ' dev size: ', dev_size

    # all_sentences, all_masks, all_labels, word2id=load_BBN_multi_labels_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_sents = np.asarray(source_ids[:train_size], dtype='int32')
    train_masks = np.asarray(source_masks[:train_size],
                             dtype=theano.config.floatX)
    train_target = np.asarray(target_ids[:train_size], dtype='int32')

    dev_sents = np.asarray(source_ids[-dev_size:], dtype='int32')
    dev_masks = np.asarray(source_masks[-dev_size:],
                           dtype=theano.config.floatX)
    dev_target = np.asarray(target_ids[-dev_size:], dtype='int32')

    en_vocab_size = len(english_word2id)
    en_rand_values = rng.normal(
        0.0, 0.01,
        (en_vocab_size, emb_size))  # initialize the matrix from a Gaussian distribution
    en_id2word = {y: x for x, y in english_word2id.iteritems()}
    en_rand_values = load_word2vec_to_init(en_rand_values, en_id2word,
                                           en_word2vec)
    en_embeddings = theano.shared(
        value=np.array(en_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  # wrap the numpy array into a Theano shared variable

    il5_vocab_size = len(il5_word2id) + 1  # add one zero pad index
    il5_rand_values = rng.normal(
        0.0, 0.01, (il5_vocab_size,
                    il5_emb_size))  # initialize the matrix from a Gaussian distribution
    il5_id2word = {y: x for x, y in il5_word2id.iteritems()}
    il5_rand_values = load_word2vec_to_init(il5_rand_values, il5_id2word,
                                            il5_word2vec)
    il5_embeddings = theano.shared(
        value=np.array(il5_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  # wrap the numpy array into a Theano shared variable

    # source_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable
    # target_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    batch_ids = T.imatrix()  #(batch, maxlen)
    batch_masks = T.fmatrix()  #(batch, maxlen)
    batch_targets = T.ivector()
    batch_test_source = T.fmatrix()  #(batch, emb_size)

    input_batch = il5_embeddings[batch_ids.flatten()].reshape(
        (batch_size, max_il5_phrase_len, il5_emb_size))  #(batch, maxlen, emb_size)
    masked_input_batch = T.sum(input_batch * batch_masks.dimshuffle(0, 1, 'x'),
                               axis=1)  #(batch, emb_size)
    # masked_input_batch = masked_input_batch / T.sum(batch_masks,axis=1).dimshuffle(0,'x')
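    # Illustration (not part of the original example): a minimal NumPy sketch of the
    # masked sum-pooling above, using a hypothetical toy batch of 2 phrases of max
    # length 3. Padding positions (mask == 0) contribute nothing to the phrase vector.
    _toy_emb = np.arange(2 * 3 * 4, dtype='float32').reshape((2, 3, 4))  # (batch, maxlen, emb)
    _toy_mask = np.array([[1, 1, 0], [1, 0, 0]], dtype='float32')        # (batch, maxlen)
    _toy_pooled = np.sum(_toy_emb * _toy_mask[:, :, None], axis=1)       # (batch, emb)
    assert _toy_pooled.shape == (2, 4)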

    target_embs = en_embeddings[batch_targets]

    HL_layer_W1, HL_layer_b1 = create_HiddenLayer_para(rng, hidden_size,
                                                       hidden_size)
    HL_layer_W2, HL_layer_b2 = create_HiddenLayer_para(rng, hidden_size,
                                                       hidden_size)
    HL_layer_W3, HL_layer_b3 = create_HiddenLayer_para(rng, hidden_size,
                                                       hidden_size)
    HL_layer_params = [
        HL_layer_W3, HL_layer_b3
    ]  # [HL_layer_W1, HL_layer_b1, HL_layer_W2, HL_layer_b2, HL_layer_W3, HL_layer_b3]
    #doc, by pos
    # HL_layer_1=HiddenLayer(rng, input=masked_input_batch, n_in=il5_emb_size, n_out=emb_size, W=HL_layer_W1, b=HL_layer_b1, activation=T.nnet.relu)
    # HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=emb_size, n_out=emb_size, W=HL_layer_W2, b=HL_layer_b2, activation=T.nnet.relu)
    HL_layer_3 = HiddenLayer(rng,
                             input=masked_input_batch,
                             n_in=emb_size,
                             n_out=emb_size,
                             W=HL_layer_W3,
                             b=HL_layer_b3,
                             activation=T.tanh)

    batch_raw_pred = HL_layer_3.output
    # batch_pred = batch_raw_pred/T.sqrt(1e-8+T.sum(batch_raw_pred**2, axis=1)).dimshuffle(0,'x')
    batch_cosine = T.mean(
        cosine_row_wise_twoMatrix(
            batch_raw_pred,
            target_embs))  #T.mean(T.sum(batch_pred *target_embs, axis=1))
    batch_distance = T.mean(
        T.sqrt(1e-8 + T.sum((batch_raw_pred - target_embs)**2, axis=1)))
    cos_loss = -T.log(1.0 + batch_cosine)  #1.0-batch_cosine
    loss = -T.log(1.0 / (1.0 + batch_distance))
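    # Illustration (not part of the original example): cosine_row_wise_twoMatrix is a
    # project helper; we assume it returns the cosine similarity of corresponding rows
    # of two matrices. A NumPy sketch of that assumed behaviour:
    def _rowwise_cosine_sketch(a, b, eps=1e-8):
        # a, b: (batch, emb) arrays; returns a (batch,) vector of cosine similarities
        num = np.sum(a * b, axis=1)
        den = np.sqrt(np.sum(a * a, axis=1)) * np.sqrt(np.sum(b * b, axis=1))
        return num / (den + eps)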

    params = HL_layer_params  # put all model parameters together
    cost = cos_loss + loss  #+1e-4*((HL_layer_W3**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([batch_ids, batch_masks, batch_targets],
                                  loss,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([batch_ids, batch_masks, batch_targets],
                                 batch_cosine,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = dev_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [dev_size - batch_size]
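    # Illustration (not part of the original example): the batch-start lists above step
    # through the data in strides of batch_size and append one extra start index so the
    # final, possibly partial, chunk is still covered by a full-sized (overlapping) batch.
    _toy_size, _toy_bs = 230, 50
    _toy_starts = list(np.arange(_toy_size // _toy_bs) * _toy_bs) + [_toy_size - _toy_bs]
    assert _toy_starts == [0, 50, 100, 150, 180]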

    train_indices = range(train_size)
    cost_i = 0.0
    max_cosine = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_target[train_id_batch])

            # every 20 batches, evaluate the model on the dev set
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                test_loss = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    test_loss_i = test_model(
                        dev_sents[test_batch_id:test_batch_id + batch_size],
                        dev_masks[test_batch_id:test_batch_id + batch_size],
                        dev_target[test_batch_id:test_batch_id + batch_size])

                    test_loss += test_loss_i
                test_loss /= len(test_batch_start)

                if test_loss > max_cosine:
                    max_cosine = test_loss
                print '\t\t\t\t\t\t\t\tcurrent mean_cosine:', test_loss, ' max cosine: ', max_cosine

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #5
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=10,
                    L2_weight=0.000001,
                    extra_size=4,
                    emb_size=300,
                    posi_emb_size=50,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=50,
                    hidden_size=300):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # random seed so the model produces reproducible results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SciTailV1_dataset(
        maxlen=maxSentLen
    )  # minlen filter: each example has one label and at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the matrix from a Gaussian distribution
    # load pretrained word2vec vectors to initialize the word embeddings
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array into a Theano shared variable

    posi_rand_values = rng.normal(
        0.0, 0.01,
        (maxSentLen,
         posi_emb_size))  # initialize the matrix from a Gaussian distribution
    posi_embeddings = theano.shared(
        value=np.array(posi_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  # wrap the numpy array into a Theano shared variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size, 1, emb_size,
                                                    filter_size[0]))
    conv_W_posi, conv_b_posi = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + posi_emb_size,
                      filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_posi, conv_b_posi, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_posi=conv_W_posi,
        b_posi=conv_b_posi,
        W_context=conv_W_context,
        b_context=conv_b_context,
        posi_emb_matrix=posi_embeddings,
        posi_emb_size=posi_emb_size)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    sent_embeddings_l = attentive_conv_layer.maxpool_vec_l
    sent_embeddings_r = attentive_conv_layer.maxpool_vec_r

    "form input to LR classifier"
    LR_input = T.concatenate([
        sent_embeddings_l, sent_embeddings_r,
        sent_embeddings_l * sent_embeddings_r, attentive_sent_embeddings_l,
        attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = 6 * hidden_size
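    # Illustration (not part of the original example): the classifier input concatenates,
    # for both the plain and the attentive sentence vectors, the triple
    # [left, right, left * right]; two triples of width hidden_size give 6 * hidden_size.
    _l = np.ones((2, 3), dtype='float32')      # toy "left" sentence vectors
    _r = 2 * np.ones((2, 3), dtype='float32')  # toy "right" sentence vectors
    _pair_features = np.concatenate([_l, _r, _l * _r], axis=1)
    assert _pair_features.shape == (2, 3 * 3)  # one triple alone is 3 * hidden wide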

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # weight matrix of shape (LR_input_size, 2)
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b
    )  # essentially a linear map (weight matrix times feature vector) followed by softmax
    loss = layer_LR.negative_log_likelihood(
        labels
    )  # for classification tasks we usually use the negative log likelihood as the loss; the lower the better

    params = [init_embeddings, posi_embeddings] + NN_para + LR_para
    # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  # shuffle the training set each epoch; this usually helps performance but is not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            # every 100 batches, evaluate on the dev set and, if dev improves, on the test set
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

                    error_sum = 0.0
                    for idd, test_batch_id in enumerate(
                            test_batch_start):  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))
                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                        store_model_to_file(
                            '/home/wenpeng/workspace/SciTail/src/model_para_' +
                            str(max_acc_test), params)
                    print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    test_3th_size = len(test_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_test size: ', joint_test_size
    print 'train size: ', train_size, ' test size: ', test_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the matrix from a Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array into a Theano shared variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out features at padded/UNK positions
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out features at padded/UNK positions
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)

    # concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2)
    # concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2))
    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))
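    # Illustration (not part of the original example): each candidate sentence is paired
    # with the claim by concatenating [claim_vec, sent_vec, <claim_vec, sent_vec>], i.e.
    # the two vectors plus their dot product, giving 2 * hidden_size + 1 features.
    _toy_claim = np.ones((2, 4), dtype='float32')
    _toy_sent = np.full((2, 4), 0.5, dtype='float32')
    _toy_match = np.concatenate(
        [_toy_claim, _toy_sent,
         np.sum(_toy_claim * _toy_sent, axis=1, keepdims=True)], axis=1)
    assert _toy_match.shape == (2, 2 * 4 + 1)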

    LR_input = concate_2_matrix
    LR_input_size = hidden_size[0] * 2 + 1
    # scoring layer: maps each candidate's feature vector to a single evidence-relevance score
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # weight vector of shape (LR_input_size, 1)
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #(batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
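    # Illustration (not part of the original example): a NumPy sketch of the "soft F1"
    # loss above on one toy instance; gold evidence labels are binary, predictions are
    # sigmoid probabilities, so precision, recall and F1 remain differentiable.
    _gold = np.array([[1., 1., 0., 0.]])
    _prob = np.array([[0.9, 0.6, 0.2, 0.1]])
    _overlap = np.sum(_gold * _prob, axis=1)        # 1.5
    _recall = _overlap / np.sum(_gold, axis=1)      # 0.75
    _precision = _overlap / np.sum(_prob, axis=1)   # 1.5 / 1.8
    _soft_f1 = 2.0 * _recall * _precision / (_recall + _precision)
    _toy_loss = -np.mean(np.log(_soft_f1))          # small when the soft F1 is near 1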
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out features at padded/UNK positions
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out features at padded/UNK positions
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    joint_sents_dot = T.batched_dot(
        joint_sents_tensor3, joint_sents_tensor3.dimshuffle(
            0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    joint_sents_dot_2_matrix = T.nnet.softmax(
        joint_sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    joint_sents_context = T.batched_dot(
        joint_sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        joint_sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    joint_add_sents_context = joint_embed_input_sents + joint_sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)
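    # Illustration (not part of the original example): the block above is plain
    # dot-product self-attention over all candidate-sentence tokens. A NumPy sketch
    # for a single toy sequence of 5 token vectors of width 4:
    def _softmax_rows(m):
        e = np.exp(m - m.max(axis=1, keepdims=True))
        return e / e.sum(axis=1, keepdims=True)
    _tokens = np.random.RandomState(0).randn(5, 4).astype('float32')  # (tokens, emb)
    _scores = _tokens.dot(_tokens.T)                # (tokens, tokens) similarity scores
    _context = _softmax_rows(_scores).dot(_tokens)  # weighted sum of all token vectors
    _enriched = _tokens + _context                  # residual combination, as in the code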

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(joint_embed_input_claim, cand_size, axis=0),
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        mask_matrix_r=T.repeat(joint_claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size, cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * joint_sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([joint_premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  # essentially a linear map (weight matrix times feature vector) followed by softmax
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  # for classification tasks we usually use the negative log likelihood as the loss; the lower the better
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)

    #fine-maxsum
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size, cand_size, hidden_size)
    test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    test_masked_sents_attconv = test_attentive_sent_embeddings_l * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_masked_claim_attconv = test_attentive_sent_embeddings_r * masked_inter_matrix.dimshuffle(
        0, 1, 'x')
    test_fine_max = T.concatenate([
        T.max(test_masked_sents_attconv, axis=1),
        T.max(test_masked_claim_attconv, axis=1)
    ],
                                  axis=1)  #(batch, 2*hidden)
    # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)

    test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max],
                                  axis=1)
    test_LR_input_size = joint_LR_input_size

    test_layer_LR = LogisticRegression(
        rng,
        input=test_LR_input,
        n_in=test_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  # essentially a linear map (weight matrix times feature vector) followed by softmax

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    max_acc = 0.0
    max_test_f1 = 0.0
    max_acc_full_evi = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  # shuffle the training set each epoch; this usually helps performance but is not guaranteed
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            # every 100 batches, evaluate the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                ideally we would rank candidates rather than make a hard binary choice;
                                the threshold (triple[0] > 0.5) controls recall and thus the strict accuracy
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)
                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 5],
                    maxSentLen=60,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # random seed so the model produces reproducible results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  # minlen filter: each example has one label and at least one word in the sentence
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_NYT_dataset(
        maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the matrix from a Gaussian distribution
    # load pretrained word2vec vectors to initialize the word embeddings
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array into a Theano shared variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    hidden_size[0],
                                                    filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size[1], 1,
                                                        hidden_size[0],
                                                        filter_size[1]))
    conv_W_context_2, conv_b_context_2 = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_2, conv_b_2, conv_W_context_2
    ]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer_2 = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W_2,
        b=conv_b_2,
        W_context=conv_W_context_2,
        b_context=conv_b_context_2)

    attentive_sent_embeddings_l_2 = attentive_conv_layer_2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r_2 = attentive_conv_layer_2.attentive_maxpool_vec_r

    #form input to HL layers
    HL_layer_1_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r,
        attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2,
        attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2
    ],
                                     axis=1)

    HL_layer_1_input_size = 6 * hidden_size[1]
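    # 6 * hidden_size[1]: the two attentive-conv layers (filter widths filter_size[0]
    # and filter_size[1]) each contribute three vectors of size hidden_size[1]:
    # left sentence, right sentence, and their elementwise product.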

    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             activation=T.nnet.relu)
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=hidden_size[1],
                             n_out=hidden_size[1],
                             activation=T.nnet.relu)

    # LR_input_size=HL_layer_1_input_size+2*hidden_size[0]
    "form input to LR classifier"
    LR_input = T.tanh(
        T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],
                      axis=1))
    LR_input_size = HL_layer_1_input_size + 2 * hidden_size[1]
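    # The LR classifier sees the raw pair features plus the outputs of both hidden
    # layers (a skip connection), hence HL_layer_1_input_size + 2 * hidden_size[1].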

    U_a = create_ensemble_para(
        rng, 3, LR_input_size)  # weight matrix for the LR classifier
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for a classification task we usually use negative log likelihood as the loss; the lower, the better.
    '''
    testing
    '''

    test_preds = T.argmax(layer_LR.p_y_given_x, axis=1)
    transfered_preds = T.eq(test_preds, 2)
    test_error = T.mean(T.neq(transfered_preds, labels))
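    # The model is trained on 3-way SNLI labels but evaluated on binary NYT labels:
    # predictions equal to class index 2 (presumably the entailment class in this
    # codebase's label encoding) are mapped to the positive class before the error
    # against the binary gold labels is computed.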

    params = [init_embeddings
              ] + NN_para + HL_layer_1.params + HL_layer_2.params + LR_para

    cost = loss

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        [test_error, transfered_preds],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
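    # Each *_batch_start list appends one extra start index (size - batch_size) so
    # that the final, possibly partial batch is still covered by a full-size batch
    # overlapping the previous one (some examples may be scored twice).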

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set at each epoch; usually helps performance, but it is not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_labels_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_labels_i)
                    gold_labels += list(
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])
                test_acc = 1.0 - error_sum / (len(test_batch_start))
                test_f1 = f1_score_2_binary_list(
                    gold_labels, pred_labels)  #, average='binary')

                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                if test_f1 > max_f1:
                    max_f1 = test_f1

                    # store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params)

                print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
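
The test step above collapses the 3-way softmax prediction into a binary decision before comparing it with the NYT gold labels. Below is a minimal NumPy sketch of that mapping, written only to illustrate the idea (the function name and the assumption that class index 2 is the positive class are not from the original code):

import numpy as np

def binary_error_from_3way(p_y_given_x, gold_binary):
    # p_y_given_x: (batch, 3) class scores; gold_binary: (batch,) labels in {0, 1}
    preds = p_y_given_x.argmax(axis=1)           # 3-way predictions
    transferred = (preds == 2).astype('int32')   # map class 2 -> positive, rest -> negative
    return float((transferred != gold_binary).mean())
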
Example #8
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=1e-5,
                    extra_size=4,
                    emb_size=300,
                    batch_size=100,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    max_term_len=4,
                    p_mode='conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed so that the model generates the same results across runs

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words(
        maxlen=maxSentLen, wordlen=max_term_len
    )  #minlen: each loaded example keeps at least one label and at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id = load_task_hyper_vs_all_with_words(
        LenciBenotto_file, maxSentLen, word2id, wordlen=max_term_len)
    store_word2id(word2id, root_dic + 'LenciBenotto_word2id.pkl')
    # exit(0)
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size
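    # The last hold_test_size WordNet pairs are held out of training; only the first
    # train_size examples are used below, while evaluation is done on the separate
    # LenciBenotto test set loaded above.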

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_word1 = np.asarray(all_word1[:train_size], dtype='int32')
    train_word2 = np.asarray(all_word2[:train_size], dtype='int32')
    test_word1 = np.asarray(test_word1, dtype='int32')
    test_word2 = np.asarray(test_word2, dtype='int32')

    train_word1_mask = np.asarray(all_word1_mask[:train_size],
                                  dtype=theano.config.floatX)
    train_word2_mask = np.asarray(all_word2_mask[:train_size],
                                  dtype=theano.config.floatX)
    test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_extra = np.asarray(all_extra[:train_size],
                             dtype=theano.config.floatX)
    test_extra = np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable
    store_model_to_file(root_dic + 'LenciBenotto_best_para_init_embeddings',
                        [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape(
        (batch_size, word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape(
        (batch_size, word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
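    # Note: the same filters (conv_W, conv_b, conv_W_context, conv_b_context) are
    # shared by all four attentive-conv pairings below (term-term, term-def,
    # def-term, def-def).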
    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=word1_mask,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([
        tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r,
        tt_embeddings_l - tt_embeddings_r
    ],
                         axis=1)

    term_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_input_r,
        mask_matrix=word1_mask,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([
        td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r,
        td_embeddings_l - td_embeddings_r
    ],
                         axis=1)

    def_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=sents_mask_l,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([
        dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r,
        dt_embeddings_l - dt_embeddings_r
    ],
                         axis=1)

    def_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([
        dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r,
        dd_embeddings_l - dd_embeddings_r
    ],
                         axis=1)

    if p_mode == 'conc':
        p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4 * 4 * hidden_size[1]
    else:
        p = T.max(T.concatenate([
            p_ww.dimshuffle('x', 0, 1),
            p_wd.dimshuffle('x', 0, 1),
            p_dw.dimshuffle('x', 0, 1),
            p_dd.dimshuffle('x', 0, 1)
        ],
                                axis=0),
                  axis=0)
        p_len = 4 * hidden_size[1]
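    # 'conc' concatenates the four pair representations p_ww, p_wd, p_dw, p_dd
    # (each of size 4 * hidden_size[1], i.e. [l, r, l*r, l-r]), giving
    # 4 * 4 * hidden_size[1] features; otherwise an elementwise max over the four
    # representations keeps the size at 4 * hidden_size[1].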

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)

    "form input to LR classifier"
    LR_input = T.concatenate([
        p,
        cosine_matrix1_matrix2_rowwise(word1_embedding,
                                       word2_embedding).dimshuffle(0, 'x'),
        extra.dimshuffle(0, 'x')
    ],
                             axis=1)
    LR_input_size = p_len + 1 + 1
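    # +1 for the cosine similarity between the two masked term embeddings and
    # +1 for the precomputed 'extra' scalar feature per pair.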
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # weight matrix for the LR classifier
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=LR_input,
        n_in=LR_input_size,
        n_out=2,
        W=U_a,
        b=LR_b,
        bias=0.25
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for a classification task we usually use negative log likelihood as the loss; the lower, the better.

    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    params = NN_para + LR_para  #[init_embeddings]

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)
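    # If test_size is not a multiple of batch_size, the last batch starts at
    # test_size - batch_size and overlaps the previous one; the duplicated
    # probability scores are dropped later, before average precision is computed.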

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set at each epoch; usually helps performance, but it is not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                train_word1[train_id_batch], train_word2[train_id_batch],
                train_word1_mask[train_id_batch],
                train_word2_mask[train_id_batch], train_extra[train_id_batch],
                train_labels_store[train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_word1[test_batch_id:test_batch_id + batch_size],
                        test_word2[test_batch_id:test_batch_id + batch_size],
                        test_word1_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_word2_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_extra[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)

                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                    store_model_to_file(
                        root_dic + 'LenciBenotto_best_para_' +
                        str(max_ap_test), params)
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed so that the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_reliefweb_il5_12_multilabel(
        maxlen=maxSentLen
    )  #minlen: each loaded example keeps at least one label and at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    # dev_sents=all_sentences[1]
    # dev_masks=all_masks[1]
    # dev_labels=all_labels[1]
    # dev_size=len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)

    # conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    # conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    # NN_para = [conv_W, conv_b, conv_W2, conv_b2]
    # conv_model = Conv_with_Mask(rng, input_tensor3=common_input,
    #          mask_matrix = sents_mask,
    #          image_shape=(batch_size, 1, emb_size, maxSentLen),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # sent_embeddings=conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    #
    # conv_model2 = Conv_with_Mask(rng, input_tensor3=common_input,
    #          mask_matrix = sents_mask,
    #          image_shape=(batch_size, 1, emb_size, maxSentLen),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2)    #mutiple mask with the conv_out to set the features by UNK to zero
    # sent_embeddings2=conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    #
    # LR_input = T.concatenate([sent_embeddings,sent_embeddings2, bow_emb], axis=1)
    # LR_input_size = hidden_size[0]*2+emb_size
    # #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    # U_a = create_ensemble_para(rng, 8, LR_input_size) # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    # LR_para=[U_a, LR_b]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    # score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 8
    # sub_labels = labels[:,:8]
    # prob_pos = T.where( sub_labels < 1, 1.0-score_matrix, score_matrix)
    # loss = -T.mean(T.log(prob_pos))
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.maxpooling_vec  #gru_layer.output_sent_rep  # (batch_size, hidden_size)
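    # maxpooling_vec presumably max-pools the GRU hidden states over time; the
    # commented alternative output_sent_rep would instead take the final hidden
    # state as the sentence representation.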

    LR_att_input = gru_sent_embeddings  #T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0]
    #classification layer: maps the feature vector to 8 class scores
    U_att_a = create_ensemble_para(
        rng, 8, LR_att_input_size)  # weight matrix for the LR classifier
    LR_att_b = theano.shared(value=np.zeros((8, ), dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=8,
        W=U_att_a,
        b=LR_att_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 8
    sub_labels = labels[:, :8]
    att_prob_pos = T.where(sub_labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))
    params = GRU_NN_para + LR_att_para  # put all model parameters together
    cost = att_loss  #+Div_reg*diversify_reg#+L2_weight*L2_reg
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''
    binarize_prob = T.where(att_score_matrix > 0.3, 1, 0)
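    # A fixed threshold of 0.3 on the per-class sigmoid scores turns the
    # multi-label probabilities into binary predictions; the threshold is a
    # tuning choice rather than anything learned.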

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask],
                                 binarize_prob,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0
        cost_i = 0.0
        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            # labels_matrix = []
            # for lab in train_labels[train_id_batch]:
            #     vec = [0]*8
            #     vec[lab]=1
            #     labels_matrix.append(vec)
            # labels_matrix = np.asarray(labels_matrix, dtype='int32')

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                # error_sum=0.0
                # for test_batch_id in test_batch_start: # for each test batch
                #     error_i, pred_labels=test_model(
                #                 test_sents[test_batch_id:test_batch_id+batch_size],
                #                 test_masks[test_batch_id:test_batch_id+batch_size])
                #     pred_labels=list(pred_labels)
                #     error_sum+=error_i
                #
                # test_accuracy=1.0-error_sum/(len(test_batch_start))
                # if test_accuracy > max_acc_test:
                #     max_acc_test=test_accuracy
                # print '\t\t\t\t\t\t\t\tcurrent testbacc:', test_accuracy, '\t\tmax_acc_test:', max_acc_test

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]

                    # gold_labels_matrix = []
                    # for lab in gold_labels:
                    #     vec = [0]*8
                    #     vec[lab]=1
                    #     gold_labels_matrix.append(vec)
                    # gold_labels_matrix = np.asarray(gold_labels_matrix, dtype='int32')

                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col_firstK(
                    all_pred_labels, all_gold_labels, 8)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
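
The GRU model above is trained with a per-class sigmoid, i.e. a multi-label binary cross-entropy: for each of the first 8 label columns it keeps the sigmoid score when the gold label is 1 and one minus the score otherwise, then averages the negative log over batch and classes. A small NumPy sketch of that loss and of the 0.3-threshold prediction step, written only to illustrate the formula (function and variable names here are not from the original code):

import numpy as np

def multilabel_sigmoid_loss(scores, labels):
    # scores: (batch, n_classes) sigmoid outputs in (0, 1); labels: (batch, n_classes) in {0, 1}
    prob_pos = np.where(labels < 1, 1.0 - scores, scores)
    return -np.mean(np.log(prob_pos))

def binarize(scores, threshold=0.3):
    # per-class binary predictions from sigmoid scores
    return (scores > threshold).astype('int32')
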
Example #10
0
def evaluate_lenet5(learning_rate=0.01, n_epochs=100, emb_size=40, batch_size=50, describ_max_len=20, type_size=12,filter_size=[3,5], maxSentLen=100, hidden_size=[300,300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed so that the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id=load_BBN_multi_labels_dataset(maxlen=maxSentLen)  #minlen: each loaded example keeps at least one label and at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len)
    label_sent=np.asarray(label_sent, dtype='int32')
    label_mask=np.asarray(label_mask, dtype=theano.config.floatX)


    train_sents=np.asarray(all_sentences[0], dtype='int32')
    train_masks=np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels=np.asarray(all_labels[0], dtype='int32')
    train_size=len(train_labels)

    dev_sents=np.asarray(all_sentences[1], dtype='int32')
    dev_masks=np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels=np.asarray(all_labels[1], dtype='int32')
    dev_size=len(dev_labels)

    test_sents=np.asarray(all_sentences[2], dtype='int32')
    test_masks=np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size=len(test_labels)

    vocab_size=  len(word2id)+1 # add one zero pad index

    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_fasttext_multiple_word2vec_given_file([emb_root+'IL5-cca-wiki-lorelei-d40.eng.vec',emb_root+'IL5-cca-wiki-lorelei-d40.IL5.vec'], 40)
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    #now, start to build the input form of the model
    sents_id_matrix=T.imatrix('sents_id_matrix')
    sents_mask=T.fmatrix('sents_mask')
    labels=T.imatrix('labels')  #batch*12

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input=embeddings[sents_id_matrix.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    repeat_common_input = T.repeat(normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen)

    des_input=embeddings[des_id_matrix.flatten()].reshape((type_size,describ_max_len, emb_size)).dimshuffle(0,2,1)
    repeat_des_input = T.tile(normalize_tensor3_colwise(des_input), (batch_size,1,1))#(batch_size*type_size, emb_size, maxsentlen)

    fine_grained_cosine = T.batched_dot(repeat_common_input.dimshuffle(0,2,1),repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len)

    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape((batch_size*type_size,maxSentLen*describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1)
    top_5_simi = sort_fine_grained_cosine_to_matrix[:,-30:] # (batch_size*type_size, 30): keeps the 30 largest similarities despite the variable name


    max_fine_grained_cosine = T.mean(top_5_simi, axis=1)
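    # Averaging the 30 largest word-to-word cosine similarities gives one
    # fine-grained "dataless" relevance score per (sentence, type description) pair.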




    cosine_scores = max_fine_grained_cosine.reshape((batch_size, type_size))
    # score_matrix = T.nnet.sigmoid(cosine_scores) #(batch_size, type_size)
    score_matrix = T.nnet.sigmoid(cosine_scores)

    prob_pos = T.where( labels < 1, 1.0-score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))


    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    '''
    do not update the embeddings, so that the model can generalize from English to the incident languages (ILs)
    '''
    params = []#+NN_para+LR_para   # put all model parameters together
    cost=loss#+1e-4*((conv_W**2).sum()+(conv_W2**2).sum())
    updates =   Gradient_Cost_Para(cost,params, learning_rate)
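    # params is empty, so Gradient_Cost_Para produces no parameter updates: the
    # embeddings stay fixed and the "training" loop below only monitors the cost
    # of this purely similarity-based (dataless) scorer.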

    '''
    testing
    '''
    binarize_prob = T.where(cosine_scores > 0.3, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask,  des_id_matrix, des_mask], binarize_prob, allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]


    # max_acc_dev=0.0
    max_meanf1_test=0.0
    max_weightf1_test=0.0
    train_indices = range(train_size)
    cost_i=0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been processed across all epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]

            cost_i+= train_model(
                                train_sents[train_id_batch],
                                train_masks[train_id_batch],
                                train_labels[train_id_batch],
                                label_sent,
                                label_mask)

            #after every 20 batches, we test the performance of the model on all test data
            if  iter%20==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()

                error_sum=0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start: # for each test batch
                    pred_labels=test_model(
                                test_sents[test_batch_id:test_batch_id+batch_size],
                                test_masks[test_batch_id:test_batch_id+batch_size],
                                label_sent,
                                label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id+batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)


                test_mean_f1, test_weight_f1 =average_f1_two_array_by_col(all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test=test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test=test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test


        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed so that the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen: each loaded example keeps at least one label and at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x')

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]

    # NN_para = multiCNN_para+ACNN_para

    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features at padded (UNK) positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out to zero out features at padded (UNK) positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
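    # LR input = two CNN max-pooled sentence vectors (hidden_size[0] each) plus the
    # bag-of-words embedding (emb_size), hence hidden_size[0] * 2 + emb_size.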
    #classification layer: maps the feature vector to 12 class scores
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # weight matrix for the LR classifier
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 and W1 each contain 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0] + emb_size
    #classification layer: maps the feature vector to 12 class scores
    U_att_a = create_ensemble_para(
        rng, 12, LR_att_input_size)  # weight matrix for the LR classifier
    LR_att_b = theano.shared(value=np.zeros((12, ),
                                            dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #essentially an affine map (W*x + b) of the input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    acnn_LR_input = T.concatenate(
        [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer: maps the attentive-conv feature vector (size acnn_LR_input_size) to scores for the 12 SF types
    acnn_U_a = create_ensemble_para(
        rng, 12, acnn_LR_input_size)  # weight matrix of shape (acnn_LR_input_size, 12)
    acnn_LR_b = theano.shared(value=np.zeros((12, ),
                                             dtype=theano.config.floatX),
                              name='acnn_LR_b',
                              borrow=True)  #bias for each target class
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #essentially an affine map (W*x + b) of the input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)

    acnn_loss = -T.mean(T.log(acnn_prob_pos))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
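    # Row-wise L2 normalization turns the dot product into a cosine similarity between each
    # sentence's bag-of-words embedding (bow_emb) and each type description's embedding
    # (bow_des, presumably built earlier in this example from the description inputs); the
    # sigmoid squashes it into (0, 1) so it can be averaged with the neural scores below.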

    params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para  # put all model parameters together
    cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() +
                                                 (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1),
        acnn_score_matrix.dimshuffle('x', 0, 1)
    ],
                                             axis=0),
                               axis=0)
    ensemble_scores = 0.5 * ensemble_NN_scores + 0.5 * cosine_score_matrix
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
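    # Test-time decision rule: take the element-wise max of the three classifiers' sigmoid
    # score matrices, average it 50/50 with the dataless cosine scores, and predict every
    # (sentence, type) cell whose ensemble score exceeds 0.3 as positive (multi-label output).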

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
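    # train_size and test_size are rarely exact multiples of batch_size, so an extra start
    # index at (size - batch_size) is appended: the final batch overlaps the previous one
    # but guarantees every example is covered once per pass.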

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], label_sent,
                                  label_mask)

            #after every 20 batches, evaluate the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
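
# --- Illustrative sketch (not part of the original example): the test-time decision rule
# --- above, restated in plain NumPy. The function name and shapes are assumptions made
# --- purely for illustration.
import numpy as np

def ensemble_binarize(nn_score_matrices, cosine_scores, threshold=0.3):
    # nn_score_matrices: list of (batch, 12) sigmoid scores from the three classifiers
    # cosine_scores:     (batch, 12) sigmoid-squashed dataless cosine scores
    nn_scores = np.max(np.stack(nn_score_matrices, axis=0), axis=0)  # element-wise max
    ensemble = 0.5 * nn_scores + 0.5 * cosine_scores                 # blend the two views
    return (ensemble > threshold).astype('int32')                    # multi-label 0/1 decisions
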
Example #12
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fix the random seed so the model reproduces the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings"; each sentence becomes a tensor2 (emb_size, sen_length), and the minibatch a tensor3 (batch_size, emb_size, sen_length)'
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    "Logistic Regression layer"
    LR_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l + attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = 4 * hidden_size[0]

    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #essentially an affine map (W*x + b) of the input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification tasks we usually use negative log likelihood as the loss; the lower the better.

    params = [init_embeddings] + NN_para + LR_para
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)
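    # Per the note above, Gradient_Cost_Para presumably implements AdaGrad: for each
    # parameter p with gradient g it accumulates acc += g**2 and applies
    # p <- p - learning_rate * g / (sqrt(acc) + eps).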

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set each epoch; this usually helps performance but is not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 1000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                '''
                test
                '''
                error_sum = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                test_acc = 1.0 - error_sum / (len(test_batch_start))

                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
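
# --- Illustrative sketch (not part of the original example): the pairwise feature vector
# --- fed to the logistic regression above, restated in NumPy. The function name is an
# --- assumption made purely for illustration.
import numpy as np

def pair_features(emb_l, emb_r):
    # emb_l, emb_r: (batch, hidden) attentive max-pooled sentence embeddings
    # returns (batch, 4*hidden): [l, r, l+r, l*r], matching LR_input_size = 4*hidden_size[0]
    return np.concatenate([emb_l, emb_r, emb_l + emb_r, emb_l * emb_r], axis=1)
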
Example #13
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    use_svm=False,
                    drop_p=0.1,
                    div_weight=0.00001,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    margin=0.1,
                    comment='five copies, sum&majority'):

    model_options = locals().copy()
    print "model options", model_options

    first_seed = 1234
    np.random.seed(first_seed)
    first_rng = np.random.RandomState(
        first_seed)  #fix the random seed so the model reproduces the same results
    first_srng = RandomStreams(first_rng.randint(999999))

    second_seed = 2345
    np.random.seed(second_seed)
    second_rng = np.random.RandomState(
        second_seed
    )  #fix the random seed so the model reproduces the same results
    second_srng = RandomStreams(second_rng.randint(888888))

    third_seed = 3456
    np.random.seed(third_seed)
    third_rng = np.random.RandomState(
        third_seed)  #fix the random seed so the model reproduces the same results
    third_srng = RandomStreams(third_rng.randint(777777))

    fourth_seed = 4567
    np.random.seed(fourth_seed)
    fourth_rng = np.random.RandomState(
        fourth_seed
    )  #fix the random seed so the model reproduces the same results
    fourth_srng = RandomStreams(fourth_rng.randint(666666))

    fifth_seed = 5678
    np.random.seed(fifth_seed)
    fifth_rng = np.random.RandomState(
        fifth_seed)  #fix the random seed so the model reproduces the same results
    fifth_srng = RandomStreams(fifth_rng.randint(555555))

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_extra, all_labels, word2id = load_SNLI_dataset_with_extra(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    #     train_sents_l = np.concatenate((train_sents_l, dev_sents_l), axis=0)
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    #     train_masks_l = np.concatenate((train_masks_l, dev_masks_l), axis=0)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    #     train_sents_r = np.concatenate((train_sents_r, dev_sents_r), axis=0)
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    #     train_masks_r = np.concatenate((train_masks_r, dev_masks_r), axis=0)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_extra = np.asarray(all_extra[0], dtype=theano.config.floatX)
    dev_extra = np.asarray(all_extra[1], dtype=theano.config.floatX)
    test_extra = np.asarray(all_extra[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    #     train_labels_store = np.concatenate((train_labels_store, dev_labels_store), axis=0)
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = first_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #load pre-trained word2vec vectors to initialize the word embeddings
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    #     word2vec =extend_word2vec_lowercase(word2vec)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    #     normed_matrix = normalize(rand_values, axis=0, norm='l2')
    first_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    second_rand_values = second_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    second_rand_values[0] = np.array(np.zeros(emb_size),
                                     dtype=theano.config.floatX)
    second_rand_values = load_word2vec_to_init(second_rand_values, id2word,
                                               word2vec)
    second_embeddings = theano.shared(
        value=np.array(second_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    third_rand_values = third_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    third_rand_values[0] = np.array(np.zeros(emb_size),
                                    dtype=theano.config.floatX)
    third_rand_values = load_word2vec_to_init(third_rand_values, id2word,
                                              word2vec)
    third_embeddings = theano.shared(value=np.array(
        third_rand_values, dtype=theano.config.floatX),
                                     borrow=True)

    fourth_rand_values = fourth_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    fourth_rand_values[0] = np.array(np.zeros(emb_size),
                                     dtype=theano.config.floatX)
    fourth_rand_values = load_word2vec_to_init(fourth_rand_values, id2word,
                                               word2vec)
    fourth_embeddings = theano.shared(value=np.array(
        fourth_rand_values, dtype=theano.config.floatX),
                                      borrow=True)

    fifth_rand_values = fifth_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    fifth_rand_values[0] = np.array(np.zeros(emb_size),
                                    dtype=theano.config.floatX)
    fifth_rand_values = load_word2vec_to_init(fifth_rand_values, id2word,
                                              word2vec)
    fifth_embeddings = theano.shared(value=np.array(
        fifth_rand_values, dtype=theano.config.floatX),
                                     borrow=True)

    #now, start to build the input form of the model
    train_flag = T.iscalar()
    first_sents_ids_l = T.imatrix()
    first_sents_mask_l = T.fmatrix()
    first_sents_ids_r = T.imatrix()
    first_sents_mask_r = T.fmatrix()
    first_labels = T.ivector()
    second_sents_ids_l = T.imatrix()
    second_sents_mask_l = T.fmatrix()
    second_sents_ids_r = T.imatrix()
    second_sents_mask_r = T.fmatrix()
    second_labels = T.ivector()
    third_sents_ids_l = T.imatrix()
    third_sents_mask_l = T.fmatrix()
    third_sents_ids_r = T.imatrix()
    third_sents_mask_r = T.fmatrix()
    third_labels = T.ivector()
    fourth_sents_ids_l = T.imatrix()
    fourth_sents_mask_l = T.fmatrix()
    fourth_sents_ids_r = T.imatrix()
    fourth_sents_mask_r = T.fmatrix()
    fourth_labels = T.ivector()
    fifth_sents_ids_l = T.imatrix()
    fifth_sents_mask_l = T.fmatrix()
    fifth_sents_ids_r = T.imatrix()
    fifth_sents_mask_r = T.fmatrix()
    fifth_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def common_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    first_common_input_l = common_input(
        first_embeddings, first_sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    first_common_input_r = common_input(
        first_embeddings, first_sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    second_common_input_l = common_input(
        second_embeddings, second_sents_ids_l
    )  #second_embeddings[second_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    second_common_input_r = common_input(
        second_embeddings, second_sents_ids_r
    )  #second_embeddings[second_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    third_common_input_l = common_input(
        third_embeddings, third_sents_ids_l
    )  #third_embeddings[third_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    third_common_input_r = common_input(
        third_embeddings, third_sents_ids_r
    )  #third_embeddings[third_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    fourth_common_input_l = common_input(
        fourth_embeddings, fourth_sents_ids_l
    )  #fourth_embeddings[fourth_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    fourth_common_input_r = common_input(
        fourth_embeddings, fourth_sents_ids_r
    )  #fourth_embeddings[fourth_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    fifth_common_input_l = common_input(
        fifth_embeddings, fifth_sents_ids_l
    )  #fifth_embeddings[fifth_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    fifth_common_input_r = common_input(
        fifth_embeddings, fifth_sents_ids_r
    )  #fifth_embeddings[fifth_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    gate_filter_shape = (hidden_size[0], 1, emb_size, 1)

    def create_CNN_params(rng):
        conv_W_2_pre, conv_b_2_pre = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2_gate, conv_b_2_gate = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2, conv_b_2 = create_conv_para(rng,
                                              filter_shape=(hidden_size[1], 1,
                                                            hidden_size[0],
                                                            filter_size[0]))
        conv_W_2_context, conv_b_2_context = create_conv_para(
            rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
        return conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate, conv_W_2, conv_b_2, conv_W_2_context, conv_b_2_context
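
    # create_CNN_params builds one copy's filters: a width-1 "pre" conv and a width-1 gate
    # conv (gate_filter_shape) for the gated convolution, plus the main conv of width
    # filter_size[0] and a width-1 context conv used by the attentive convolution over pairs.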

    first_conv_W_pre, first_conv_b_pre, first_conv_W_gate, first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context, first_conv_b_context = create_CNN_params(
        first_rng)
    second_conv_W_pre, second_conv_b_pre, second_conv_W_gate, second_conv_b_gate, second_conv_W, second_conv_b, second_conv_W_context, second_conv_b_context = create_CNN_params(
        second_rng)
    third_conv_W_pre, third_conv_b_pre, third_conv_W_gate, third_conv_b_gate, third_conv_W, third_conv_b, third_conv_W_context, third_conv_b_context = create_CNN_params(
        third_rng)
    fourth_conv_W_pre, fourth_conv_b_pre, fourth_conv_W_gate, fourth_conv_b_gate, fourth_conv_W, fourth_conv_b, fourth_conv_W_context, fourth_conv_b_context = create_CNN_params(
        fourth_rng)
    fifth_conv_W_pre, fifth_conv_b_pre, fifth_conv_W_gate, fifth_conv_b_gate, fifth_conv_W, fifth_conv_b, fifth_conv_W_context, fifth_conv_b_context = create_CNN_params(
        fifth_rng)
    '''
    dropout paras
    '''

    def dropout_group(rng, conv_W_2_pre, conv_W_2_gate, conv_W_2,
                      conv_W_2_context):
        drop_conv_W_2_pre = dropout_layer(rng, conv_W_2_pre, drop_p,
                                          train_flag)
        drop_conv_W_2_gate = dropout_layer(rng, conv_W_2_gate, drop_p,
                                           train_flag)
        drop_conv_W_2 = dropout_layer(rng, conv_W_2, drop_p, train_flag)
        drop_conv_W_2_context = dropout_layer(rng, conv_W_2_context, drop_p,
                                              train_flag)
        return drop_conv_W_2_pre, drop_conv_W_2_gate, drop_conv_W_2, drop_conv_W_2_context
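
    # dropout_group applies dropout to the convolution filter weights themselves (weight
    # dropout) rather than to activations; train_flag presumably switches the random masks
    # on during training and off at test time.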

    drop_first_conv_W_pre, drop_first_conv_W_gate, drop_first_conv_W, drop_first_conv_W_context = dropout_group(
        first_srng, first_conv_W_pre, first_conv_W_gate, first_conv_W,
        first_conv_W_context)
    drop_second_conv_W_pre, drop_second_conv_W_gate, drop_second_conv_W, drop_second_conv_W_context = dropout_group(
        second_srng, second_conv_W_pre, second_conv_W_gate, second_conv_W,
        second_conv_W_context)
    drop_third_conv_W_pre, drop_third_conv_W_gate, drop_third_conv_W, drop_third_conv_W_context = dropout_group(
        third_srng, third_conv_W_pre, third_conv_W_gate, third_conv_W,
        third_conv_W_context)
    drop_fourth_conv_W_pre, drop_fourth_conv_W_gate, drop_fourth_conv_W, drop_fourth_conv_W_context = dropout_group(
        fourth_srng, fourth_conv_W_pre, fourth_conv_W_gate, fourth_conv_W,
        fourth_conv_W_context)
    drop_fifth_conv_W_pre, drop_fifth_conv_W_gate, drop_fifth_conv_W, drop_fifth_conv_W_context = dropout_group(
        fifth_srng, fifth_conv_W_pre, fifth_conv_W_gate, fifth_conv_W,
        fifth_conv_W_context)

    first_NN_para = [  #conv_W, conv_b,
        first_conv_W_pre, first_conv_b_pre, first_conv_W_gate,
        first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context
    ]
    second_NN_para = [
        second_conv_W_pre, second_conv_b_pre, second_conv_W_gate,
        second_conv_b_gate, second_conv_W, second_conv_b, second_conv_W_context
    ]

    third_NN_para = [
        third_conv_W_pre, third_conv_b_pre, third_conv_W_gate,
        third_conv_b_gate, third_conv_W, third_conv_b, third_conv_W_context
    ]
    fourth_NN_para = [
        fourth_conv_W_pre, fourth_conv_b_pre, fourth_conv_W_gate,
        fourth_conv_b_gate, fourth_conv_W, fourth_conv_b, fourth_conv_W_context
    ]

    fifth_NN_para = [
        fifth_conv_W_pre, fifth_conv_b_pre, fifth_conv_W_gate,
        fifth_conv_b_gate, fifth_conv_W, fifth_conv_b, fifth_conv_W_context
    ]
    '''
    classifier: one full model copy, instantiated five times below
    '''

    def classifier(rng, common_input_l, common_input_r, sents_mask_l,
                   sents_mask_r, drop_conv_W_2_pre, conv_b_2_pre,
                   drop_conv_W_2_gate, conv_b_2_gate, drop_conv_W_2, conv_b_2,
                   drop_conv_W_2_context, conv_b_2_context, labels):
        conv_layer_2_gate_l = Conv_with_Mask_with_Gate(
            rng,
            input_tensor3=common_input_l,
            mask_matrix=sents_mask_l,
            image_shape=(batch_size, 1, emb_size, maxSentLen),
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre,
            b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate,
            b_gate=conv_b_2_gate)
        conv_layer_2_gate_r = Conv_with_Mask_with_Gate(
            rng,
            input_tensor3=common_input_r,
            mask_matrix=sents_mask_r,
            image_shape=(batch_size, 1, emb_size, maxSentLen),
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre,
            b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate,
            b_gate=conv_b_2_gate)

        l_input_4_att = conv_layer_2_gate_l.output_tensor3  #conv_layer_2_gate_l.masked_conv_out_sigmoid*conv_layer_2_pre_l.masked_conv_out+(1.0-conv_layer_2_gate_l.masked_conv_out_sigmoid)*common_input_l
        r_input_4_att = conv_layer_2_gate_r.output_tensor3  #conv_layer_2_gate_r.masked_conv_out_sigmoid*conv_layer_2_pre_r.masked_conv_out+(1.0-conv_layer_2_gate_r.masked_conv_out_sigmoid)*common_input_r

        conv_layer_2 = Conv_for_Pair(
            rng,
            origin_input_tensor3=common_input_l,
            origin_input_tensor3_r=common_input_r,
            input_tensor3=l_input_4_att,
            input_tensor3_r=r_input_4_att,
            mask_matrix=sents_mask_l,
            mask_matrix_r=sents_mask_r,
            image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
            image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
            filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
            filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
            W=drop_conv_W_2,
            b=conv_b_2,
            W_context=drop_conv_W_2_context,
            b_context=conv_b_2_context)
        attentive_sent_embeddings_l_2 = conv_layer_2.attentive_maxpool_vec_l
        attentive_sent_embeddings_r_2 = conv_layer_2.attentive_maxpool_vec_r
        # attentive_sent_sumpool_l_2 = conv_layer_2.attentive_sumpool_vec_l
        # attentive_sent_sumpool_r_2 = conv_layer_2.attentive_sumpool_vec_r

        HL_layer_1_input = T.concatenate([
            attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2,
            attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2
        ],
                                         axis=1)

        HL_layer_1_input_size = hidden_size[
            1] * 3  #+extra_size#+(maxSentLen*2+10*2)#+hidden_size[1]*3+1

        HL_layer_1 = HiddenLayer(rng,
                                 input=HL_layer_1_input,
                                 n_in=HL_layer_1_input_size,
                                 n_out=hidden_size[0],
                                 activation=T.nnet.relu)
        HL_layer_2 = HiddenLayer(rng,
                                 input=HL_layer_1.output,
                                 n_in=hidden_size[0],
                                 n_out=hidden_size[0],
                                 activation=T.nnet.relu)

        LR_input_size = HL_layer_1_input_size + 2 * hidden_size[0]
        U_a = create_ensemble_para(
            rng, 3, LR_input_size)  # weight matrix of shape (LR_input_size, 3)
        LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
        LR_para = [U_a, LR_b]

        LR_input = T.tanh(
            T.concatenate(
                [HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],
                axis=1))
        layer_LR = LogisticRegression(
            rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b
        )  #essentially an affine map (W*x + b) of the input feature vector
        loss = layer_LR.negative_log_likelihood(
            labels
        )  #for classification tasks we usually use negative log likelihood as the loss; the lower the better.

        return loss, LR_para + HL_layer_1.params + HL_layer_2.params, layer_LR.p_y_given_x, layer_LR.errors(
            labels)
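
    # classifier() wires one complete copy of the model: a width-1 gated convolution over each
    # sentence, attentive convolution over the pair (Conv_for_Pair), the feature vector
    # [l, r, l*r] fed through two ReLU hidden layers, and a 3-way softmax over the tanh of the
    # concatenated features; it returns the NLL loss, its own parameters, the predicted class
    # distribution and the batch error rate.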

    first_loss, first_classifier_params, first_test_distr, first_error = classifier(
        first_rng, first_common_input_l, first_common_input_r,
        first_sents_mask_l, first_sents_mask_r, drop_first_conv_W_pre,
        first_conv_b_pre, drop_first_conv_W_gate, first_conv_b_gate,
        drop_first_conv_W, first_conv_b, drop_first_conv_W_context,
        first_conv_b_context, first_labels)
    second_loss, second_classifier_params, second_test_distr, second_error = classifier(
        second_rng, second_common_input_l, second_common_input_r,
        second_sents_mask_l, second_sents_mask_r, drop_second_conv_W_pre,
        second_conv_b_pre, drop_second_conv_W_gate, second_conv_b_gate,
        drop_second_conv_W, second_conv_b, drop_second_conv_W_context,
        second_conv_b_context, second_labels)
    third_loss, third_classifier_params, third_test_distr, third_error = classifier(
        third_rng, third_common_input_l, third_common_input_r,
        third_sents_mask_l, third_sents_mask_r, drop_third_conv_W_pre,
        third_conv_b_pre, drop_third_conv_W_gate, third_conv_b_gate,
        drop_third_conv_W, third_conv_b, drop_third_conv_W_context,
        third_conv_b_context, third_labels)
    fourth_loss, fourth_classifier_params, fourth_test_distr, fourth_error = classifier(
        fourth_rng, fourth_common_input_l, fourth_common_input_r,
        fourth_sents_mask_l, fourth_sents_mask_r, drop_fourth_conv_W_pre,
        fourth_conv_b_pre, drop_fourth_conv_W_gate, fourth_conv_b_gate,
        drop_fourth_conv_W, fourth_conv_b, drop_fourth_conv_W_context,
        fourth_conv_b_context, fourth_labels)
    fifth_loss, fifth_classifier_params, fifth_test_distr, fifth_error = classifier(
        fifth_rng, fifth_common_input_l, fifth_common_input_r,
        fifth_sents_mask_l, fifth_sents_mask_r, drop_fifth_conv_W_pre,
        fifth_conv_b_pre, drop_fifth_conv_W_gate, fifth_conv_b_gate,
        drop_fifth_conv_W, fifth_conv_b, drop_fifth_conv_W_context,
        fifth_conv_b_context, fifth_labels)
    '''
    testing: all five copies receive the same test batch, so the five label inputs are identical
    '''
    all_prop_distr = first_test_distr + second_test_distr + third_test_distr + fourth_test_distr + fifth_test_distr
    first_preds = T.argmax(first_test_distr, axis=1).dimshuffle('x',
                                                                0)  #(1, batch)
    second_preds = T.argmax(second_test_distr,
                            axis=1).dimshuffle('x', 0)  #(1, batch)
    third_preds = T.argmax(third_test_distr, axis=1).dimshuffle('x',
                                                                0)  #(1, batch)
    fourth_preds = T.argmax(fourth_test_distr,
                            axis=1).dimshuffle('x', 0)  #(1, batch)
    fifth_preds = T.argmax(fifth_test_distr, axis=1).dimshuffle('x',
                                                                0)  #(1, batch)
    overall_preds = T.concatenate(
        [first_preds, second_preds, third_preds, fourth_preds, fifth_preds],
        axis=0)  #(5, batch)
    all_error = T.mean(T.neq(T.argmax(all_prop_distr, axis=1), first_labels))
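    # Two test-time ensembles are formed from the five copies: all_prop_distr sums the five
    # class distributions (all_error scores its argmax against the gold labels), while
    # overall_preds stacks the five individual argmax predictions (5, batch) for the
    # majority vote computed later in the training loop.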

    #     neg_labels = T.where( labels < 2, 2, labels-1)
    #     loss2=-T.mean(T.log(1.0/(1.0+layer_LR.p_y_given_x))[T.arange(neg_labels.shape[0]), neg_labels])

    # rank loss
    # entail_prob_batch = T.nnet.softmax(layer_LR.before_softmax.T)[2] #batch
    # entail_ids = elementwise_is_two(labels)
    # entail_probs = entail_prob_batch[entail_ids.nonzero()]
    # non_entail_probs = entail_prob_batch[(1-entail_ids).nonzero()]
    #
    # repeat_entail = T.extra_ops.repeat(entail_probs, non_entail_probs.shape[0], axis=0)
    # repeat_non_entail = T.extra_ops.repeat(non_entail_probs.dimshuffle('x',0), entail_probs.shape[0], axis=0).flatten()
    # loss2 = -T.mean(T.log(entail_probs))#T.mean(T.maximum(0.0, margin-repeat_entail+repeat_non_entail))

    # zero_matrix = T.zeros((batch_size, 3))
    # filled_zero_matrix = T.set_subtensor(zero_matrix[T.arange(batch_size), labels], 1.0)
    # prob_batch_posi = layer_LR.p_y_given_x[filled_zero_matrix.nonzero()]
    # prob_batch_nega = layer_LR.p_y_given_x[(1-filled_zero_matrix).nonzero()]
    #
    # repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    # repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    # loss2 = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    first_params = [first_embeddings] + first_NN_para + first_classifier_params
    second_params = [second_embeddings
                     ] + second_NN_para + second_classifier_params
    third_params = [third_embeddings] + third_NN_para + third_classifier_params
    fourth_params = [fourth_embeddings
                     ] + fourth_NN_para + fourth_classifier_params
    fifth_params = [fifth_embeddings] + fifth_NN_para + fifth_classifier_params

    params = first_params + second_params + third_params + fourth_params + fifth_params

    #     L2_reg =L2norm_paraList([embeddings,HL_layer_1.W, HL_layer_2.W])

    #     diversify_reg= (Diversify_Reg(conv_W_2_pre_to_matrix)+Diversify_Reg(conv_W_2_gate_to_matrix)+
    #                     Diversify_Reg(conv_W_2_to_matrix)+Diversify_Reg(conv_W_2_context_to_matrix))

    cost = first_loss + second_loss + third_loss + fourth_loss + fifth_loss  #+0.1*loss2#+loss2#+L2_weight*L2_reg

    first_updates = Gradient_Cost_Para(first_loss, first_params, learning_rate)
    second_updates = Gradient_Cost_Para(second_loss, second_params,
                                        learning_rate)
    third_updates = Gradient_Cost_Para(third_loss, third_params, learning_rate)
    fourth_updates = Gradient_Cost_Para(fourth_loss, fourth_params,
                                        learning_rate)
    fifth_updates = Gradient_Cost_Para(fifth_loss, fifth_params, learning_rate)
    updates = first_updates + second_updates + third_updates + fourth_updates + fifth_updates

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([
        train_flag, first_sents_ids_l, first_sents_mask_l, first_sents_ids_r,
        first_sents_mask_r, first_labels, second_sents_ids_l,
        second_sents_mask_l, second_sents_ids_r, second_sents_mask_r,
        second_labels, third_sents_ids_l, third_sents_mask_l,
        third_sents_ids_r, third_sents_mask_r, third_labels,
        fourth_sents_ids_l, fourth_sents_mask_l, fourth_sents_ids_r,
        fourth_sents_mask_r, fourth_labels, fifth_sents_ids_l,
        fifth_sents_mask_l, fifth_sents_ids_r, fifth_sents_mask_r, fifth_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    #     train_model_pred = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra,labels,
    #                                         second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], [LR_input, labels], allow_input_downcast=True, on_unused_input='ignore')
    #
    #     dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra, labels,
    #                                  second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        train_flag, first_sents_ids_l, first_sents_mask_l, first_sents_ids_r,
        first_sents_mask_r, first_labels, second_sents_ids_l,
        second_sents_mask_l, second_sents_ids_r, second_sents_mask_r,
        second_labels, third_sents_ids_l, third_sents_mask_l,
        third_sents_ids_r, third_sents_mask_r, third_labels,
        fourth_sents_ids_l, fourth_sents_mask_l, fourth_sents_ids_r,
        fourth_sents_mask_r, fourth_labels, fifth_sents_ids_l,
        fifth_sents_mask_l, fifth_sents_ids_r, fifth_sents_mask_r, fifth_labels
    ], [
        first_error, second_error, third_error, fourth_error, fifth_error,
        all_error, overall_preds
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_acc_svm = 0.0
    max_acc_lr = 0.0
    max_majority_acc = 0.0

    cost_i = 0.0
    first_train_indices = range(train_size)
    second_train_indices = range(train_size)
    third_train_indices = range(train_size)
    fourth_train_indices = range(train_size)
    fifth_train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            first_train_indices
        )  #shuffle the training set each epoch; this usually helps performance but is not guaranteed
        random.Random(200).shuffle(second_train_indices)
        random.Random(300).shuffle(third_train_indices)
        random.Random(400).shuffle(fourth_train_indices)
        random.Random(500).shuffle(fifth_train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            first_train_id_batch = first_train_indices[batch_id:batch_id +
                                                       batch_size]
            second_train_id_batch = second_train_indices[batch_id:batch_id +
                                                         batch_size]
            third_train_id_batch = third_train_indices[batch_id:batch_id +
                                                       batch_size]
            fourth_train_id_batch = fourth_train_indices[batch_id:batch_id +
                                                         batch_size]
            fifth_train_id_batch = fifth_train_indices[batch_id:batch_id +
                                                       batch_size]
            cost_i += train_model(1, train_sents_l[first_train_id_batch],
                                  train_masks_l[first_train_id_batch],
                                  train_sents_r[first_train_id_batch],
                                  train_masks_r[first_train_id_batch],
                                  train_labels_store[first_train_id_batch],
                                  train_sents_l[second_train_id_batch],
                                  train_masks_l[second_train_id_batch],
                                  train_sents_r[second_train_id_batch],
                                  train_masks_r[second_train_id_batch],
                                  train_labels_store[second_train_id_batch],
                                  train_sents_l[third_train_id_batch],
                                  train_masks_l[third_train_id_batch],
                                  train_sents_r[third_train_id_batch],
                                  train_masks_r[third_train_id_batch],
                                  train_labels_store[third_train_id_batch],
                                  train_sents_l[fourth_train_id_batch],
                                  train_masks_l[fourth_train_id_batch],
                                  train_sents_r[fourth_train_id_batch],
                                  train_masks_r[fourth_train_id_batch],
                                  train_labels_store[fourth_train_id_batch],
                                  train_sents_l[fifth_train_id_batch],
                                  train_masks_l[fifth_train_id_batch],
                                  train_sents_r[fifth_train_id_batch],
                                  train_masks_r[fifth_train_id_batch],
                                  train_labels_store[fifth_train_id_batch])

            #after every int(2000*(50.0/batch_size)) batches, evaluate the model on all test data
            if iter % int(2000 * (50.0 / batch_size)) == 0:
                #             if iter%int(200*(50.0 / batch_size))==0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
                #     print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                #     past_time = time.time()

                #                 error_sum=0.0
                #                 for dev_batch_id in dev_batch_start: # for each test batch
                #                     error_i=dev_model(
                #                                 dev_sents_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_sents_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                #                                 )
                #
                #                     error_sum+=error_i
                #                 dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                #                 if dev_accuracy > max_acc_dev:
                #                     max_acc_dev=dev_accuracy
                #                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                #best dev model, do test
                error_sum_1 = 0.0
                error_sum_2 = 0.0
                error_sum_3 = 0.0
                error_sum_4 = 0.0
                error_sum_5 = 0.0
                error_both = 0.0
                ys = []
                gold_ys = []
                for test_batch_id in test_batch_start:  # for each test batch
                    error_1, error_2, error_3, error_4, error_5, both_error_batch, batch_ys = test_model(
                        0, test_sents_l[test_batch_id:test_batch_id +
                                        batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum_1 += error_1
                    error_sum_2 += error_2
                    error_sum_3 += error_3
                    error_sum_4 += error_4
                    error_sum_5 += error_5
                    error_both += both_error_batch
                    ys.append(batch_ys)
                    gold_ys.append(
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])
                test_acc_1 = 1.0 - error_sum_1 / (len(test_batch_start))
                test_acc_2 = 1.0 - error_sum_2 / (len(test_batch_start))
                test_acc_3 = 1.0 - error_sum_3 / (len(test_batch_start))
                test_acc_4 = 1.0 - error_sum_4 / (len(test_batch_start))
                test_acc_5 = 1.0 - error_sum_5 / (len(test_batch_start))

                all_ys = np.concatenate(ys, axis=1)  #(5, test_size)
                gold_ys = np.concatenate(gold_ys)
                majority_ys = mode(np.transpose(all_ys), axis=-1)[0][:, 0]
                majority_acc = 1.0 - np.not_equal(
                    gold_ys, majority_ys).sum() * 1.0 / len(gold_ys)
                test_acc_both = 1.0 - error_both / (len(test_batch_start))
                if test_acc_1 > max_acc_test:
                    max_acc_test = test_acc_1
                if test_acc_2 > max_acc_test:
                    max_acc_test = test_acc_2
                if test_acc_3 > max_acc_test:
                    max_acc_test = test_acc_3
                if test_acc_4 > max_acc_test:
                    max_acc_test = test_acc_4
                if test_acc_5 > max_acc_test:
                    max_acc_test = test_acc_5

                if test_acc_both > max_acc_test:
                    max_acc_test = test_acc_both
                    store_model_to_file(
                        '/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'
                        + str(max_acc_test), params)
                if majority_acc > max_majority_acc:
                    max_majority_acc = majority_acc

                print '\t\tcurrent acc:', test_acc_1, test_acc_2, test_acc_3, test_acc_4, test_acc_5, ' ; ', test_acc_both, majority_acc, '\t\t\t\t\tmax_acc:', max_acc_test, max_majority_acc


#                 else:
#                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
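
A quick aside on the majority vote above: the script stacks each copy's predicted labels into all_ys with shape (5, test_size) and takes the per-example mode via scipy.stats.mode. The standalone sketch below reproduces that vote with plain NumPy; the toy predictions and gold labels are made up for illustration only.

import numpy as np

# toy predictions: five copies x six test examples
all_ys = np.array([[0, 1, 2, 1, 0, 2],
                   [0, 1, 2, 2, 0, 2],
                   [1, 1, 2, 1, 0, 0],
                   [0, 2, 2, 1, 1, 2],
                   [0, 1, 1, 1, 0, 2]])    # shape (5, test_size)
gold_ys = np.array([0, 1, 2, 0, 0, 2])

# per-example majority vote over the five copies (same effect as scipy.stats.mode)
majority_ys = np.array([np.bincount(col).argmax() for col in np.transpose(all_ys)])
majority_acc = 1.0 - np.not_equal(gold_ys, majority_ys).sum() * 1.0 / len(gold_ys)
print(majority_acc)    # 5 of 6 votes match the gold labels -> 0.8333...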
Example #14
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3, 1],
                    maxSentLen=70,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, controls that the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen: each example includes one label and at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in'
    'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) '
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_pre, conv_b_pre = create_conv_para(rng,
                                              filter_shape=gate_filter_shape)
    conv_W_gate, conv_b_gate = create_conv_para(rng,
                                                filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate,
        conv_b_gate, conv_W2, conv_b2, conv_W2_context
    ]

    "A gated convolution layer to form more expressive word representations in each sentence"
    "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)"
    conv_layer_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_l,
        mask_matrix=sents_mask_l,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    conv_layer_gate_r = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_r,
        mask_matrix=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
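    # note: the gate-convolution parameters (conv_W_pre/conv_b_pre and
    # conv_W_gate/conv_b_gate) are shared between the left and right sentences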
    '''
    attentive convolution function, two sizes of filter_width 3&1 are used. Multi-channel
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r

    "Batch normalization for the four output sentence representation vectors"
    gamma = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[0]),
        high=1.0 / math.sqrt(hidden_size[0]),
        size=(hidden_size[0])),
                                     dtype=theano.config.floatX),
                          borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]),
                                  dtype=theano.config.floatX),
                         borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r.std((0, ), keepdims=True),
        mode='low_mem')

    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0, ), keepdims=True),
        mode='low_mem')

    "Before logistic regression layer, we insert a hidden layer. Now form input to HL classifier"
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ],
                                     axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]
    "Create hidden layer parameters"
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    "Hidden Layer and batch norm to its output again"
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[1]),
        high=1.0 / math.sqrt(hidden_size[1]),
        size=(hidden_size[1])),
                                        dtype=theano.config.floatX),
                             borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]),
                                     dtype=theano.config.floatX),
                            borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(inputs=HL_layer_1.output,
                                       gamma=gamma_HL,
                                       beta=beta_HL,
                                       mean=HL_layer_1.output.mean(
                                           (0, ), keepdims=True),
                                       std=HL_layer_1.output.std(
                                           (0, ), keepdims=True),
                                       mode='low_mem')
    "Form input to LR classifier"
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    "Logistic Regression layer"
    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification tasks, we usually use negative log likelihood as the loss; the lower the better.

    params = [
        init_embeddings
    ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
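    # each *_batch_start list appends one extra start index (size - batch_size) so
    # the tail examples are still covered by a final full batch, which overlaps
    # with the previous batch instead of truncating the data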

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle the training set for each new epoch; supposed to improve performance, but not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, accumulated across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2
                                                     and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    '''
                    best dev model, test
                    '''
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))

                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
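
Example #14 classifies a sentence pair by concatenating, for each filter width, the two attentive sentence vectors with their element-wise sum and product (HL_layer_1_input above). A minimal NumPy sketch of that feature construction; the sizes and random vectors below are purely illustrative.

import numpy as np

batch_size, hidden = 4, 6                          # toy sizes
rng = np.random.RandomState(0)
sent_l = rng.normal(size=(batch_size, hidden))     # stands in for attentive_sent_embeddings_l
sent_r = rng.normal(size=(batch_size, hidden))     # stands in for attentive_sent_embeddings_r

# same recipe as HL_layer_1_input: [l, r, l + r, l * r] along the feature axis
pair_features = np.concatenate(
    [sent_l, sent_r, sent_l + sent_r, sent_l * sent_r], axis=1)
print(pair_features.shape)    # (4, 24) == (batch_size, 4 * hidden)
# with the second filter width the script stacks another such block,
# giving the 8 * hidden_size[0] features fed to the hidden layer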
Example #15
0
def evaluate_lenet5(learning_rate=0.02, n_epochs=100, emb_size=300, batch_size=50, filter_size=[3], sent_len=40, claim_len=20, cand_size=10,hidden_size=[300,300], max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, controls that the model generates the same results across runs
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, _, word2id  = load_fever_train(sent_len, claim_len, cand_size)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names,test_ground_names,_, word2id = load_fever_dev(sent_len, claim_len, cand_size, word2id)

    train_sents=np.asarray(train_sents, dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents=np.asarray(test_sents, dtype='int32')

    train_sent_masks=np.asarray(train_sent_masks, dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_sent_masks=np.asarray(test_sent_masks, dtype=theano.config.floatX)

    train_sent_labels=np.asarray(train_sent_labels, dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sent_labels=np.asarray(test_sent_labels, dtype='int32')

    train_claims=np.asarray(train_claims, dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_claims=np.asarray(test_claims, dtype='int32')

    train_claim_mask=np.asarray(train_claim_mask, dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_claim_mask=np.asarray(test_claim_mask, dtype=theano.config.floatX)


    # train_labels_store=np.asarray(all_labels[0], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[2], dtype='int32')

    train_size=len(train_claims)
    # dev_size=len(dev_labels_store)
    test_size=len(test_claims)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size=len(word2id)+1

    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    "now, start to build the input form of the model"
    sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    sents_mask=T.ftensor3()
    sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix() #(batch, claim_len)
    claim_mask = T.imatrix()
    # labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'


    embed_input_sents=init_embeddings[sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim=init_embeddings[claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1)



    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    # conv_W2, conv_b2=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    NN_para = [conv_W, conv_b]
    conv_model_sents = Conv_with_Mask(rng, input_tensor3=embed_input_sents,
             mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])),
             image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #multiply the mask with conv_out to zero out the features produced by padding (UNK)
    sent_embeddings=conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(rng, input_tensor3=embed_input_claim,
             mask_matrix = claim_mask,
             image_shape=(batch_size, 1, emb_size, claim_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #multiply the mask with conv_out to zero out the features produced by padding (UNK)
    claim_embeddings=conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0,'x', 1), cand_size, axis=1)

    concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2)
    concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2))
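    # the claim vector is broadcast across all cand_size candidate sentences, so each
    # (claim, candidate) pair contributes one row of length hidden_size[0]*2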

    LR_input = concate_2_matrix#T.concatenate([sent_embeddings,sent_embeddings2], axis=1)
    LR_input_size = hidden_size[0]*2
    #scoring layer: maps each (claim, candidate) feature vector of size hidden_size[0]*2 to a single relevance score
    U_a = create_ensemble_para(rng, 1, LR_input_size) # weight vector of shape (LR_input_size, 1)
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(concate_2_matrix.dot(U_a))  #(batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    loss = -T.mean(T.log(prob_pos))
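    # prob_pos is, per candidate sentence, the probability assigned to the gold
    # decision (the sigmoid score for true evidence, 1 - score otherwise), so the
    # loss is the mean negative log of these values, i.e. a binary cross-entropy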

    #
    # "Logistic Regression layer"
    # LR_input = T.concatenate([attentive_sent_embeddings_l,attentive_sent_embeddings_r,attentive_sent_embeddings_l+attentive_sent_embeddings_r,attentive_sent_embeddings_l*attentive_sent_embeddings_r],axis=1)
    # LR_input_size=4*hidden_size[0]
    #
    # U_a = create_ensemble_para(rng, 3, LR_input_size) # (input_size, 3)
    # LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    # LR_para=[U_a, LR_b]
    #
    # layer_LR=LogisticRegression(rng, input=normalize_matrix_col_wise(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    '''
    testing
    '''
    binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size)



    params = [init_embeddings]+NN_para+LR_para
    cost=loss
    "Use AdaGrad to update parameters"
    updates =   Gradient_Cost_Para(cost,params, learning_rate)


    train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids,sents_mask,claim_ids,claim_mask], inter_matrix, allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]


    max_acc_dev=0.0
    max_test_f1=0.0

    cost_i=0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(train_indices) #shuffle the training set for each new epoch; supposed to improve performance, but not guaranteed

        iter_accu=0

        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been run so far, accumulated across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]
            '''
            train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask
            sents_ids,sents_mask,sents_labels,claim_ids,claim_mask
            '''
            cost_i+= train_model(
                                train_sents[train_id_batch],
                                train_sent_masks[train_id_batch],
                                train_sent_labels[train_id_batch],
                                train_claims[train_id_batch],
                                train_claim_mask[train_id_batch])

            #after every 10 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()



                '''
                test
                test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask,
                sents_ids,sents_mask,claim_ids,claim_mask
                '''
                f1_sum=0.0
                for test_batch_id in test_batch_start: # for each test batch
                    batch_prob=test_model(
                            test_sents[test_batch_id:test_batch_id+batch_size],
                            test_sent_masks[test_batch_id:test_batch_id+batch_size],
                            test_claims[test_batch_id:test_batch_id+batch_size],
                            test_claim_mask[test_batch_id:test_batch_id+batch_size])

                    batch_sent_labels = test_sent_labels[test_batch_id:test_batch_id+batch_size]
                    batch_sent_names = test_sent_names[test_batch_id:test_batch_id+batch_size]
                    batch_ground_names = test_ground_names[test_batch_id:test_batch_id+batch_size]


                    for i in range(batch_size):
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]

                        zipped=[(batch_prob[i,k],batch_sent_labels[i][k],batch_sent_names[i][k]) for k in range(cand_size)]
                        sorted_zip = sorted(zipped, key=lambda x: x[0], reverse=True)
                        # print 'sorted_zip:', sorted_zip
                        # exit(0)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                we should consider a rank, instead of binary
                                '''
                                if triple[0] >0.5:
                                    pred_sent_names.append(batch_sent_names[i][j])
                                    if len(pred_sent_names) == max_pred_pick:
                                        break
                        f1_i = compute_f1_two_list_names(pred_sent_names, gold_sent_names)
                        f1_sum+=f1_i


                test_f1=f1_sum/(len(test_batch_start)*batch_size)

                if test_f1 > max_test_f1:
                    max_test_f1=test_f1
                print '\t\tcurrent test_f1:', test_f1,' ; ','\t\t\t\t\tmax_test_f1:', max_test_f1



        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_test_f1
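
The FEVER example above scores evidence retrieval with compute_f1_two_list_names, which is imported from elsewhere and not shown on this page. A plausible set-based stand-in (an assumption about the helper's behavior, not the original implementation) is sketched below.

def compute_f1_two_list_names(pred_names, gold_names):
    # hypothetical stand-in: set-based F1 between predicted and gold evidence names
    pred, gold = set(pred_names), set(gold_names)
    if not pred or not gold:
        return 0.0
    overlap = len(pred & gold)
    if overlap == 0:
        return 0.0
    precision = overlap * 1.0 / len(pred)
    recall = overlap * 1.0 / len(gold)
    return 2 * precision * recall / (precision + recall)

print(compute_f1_two_list_names(['page1_sent3', 'page2_sent0'], ['page1_sent3']))    # 0.6667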