コード例 #1
0
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, control the model generates the same results


    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id  =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len)  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl')
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len)



    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)


    test_sents_r=np.asarray(test_sents_r, dtype='int32')


    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)


    test_word1=np.asarray(test_word1, dtype='int32')
    test_word2=np.asarray(test_word2, dtype='int32')


    test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX)


    test_labels_store=np.asarray(test_labels, dtype='int32')

    test_extra=np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size=len(test_sents_l)
    print ' test size: ', test_size

    vocab_size=len(word2id)+1


    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable
    # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1)
    word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1)


    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = word1_mask,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1)

    term_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_input_r,
             mask_matrix = word1_mask,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1)


    def_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = sents_mask_l,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1)


    def_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1)

    if p_mode == 'conc':
        p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4*4*hidden_size[1]
    else:
        p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0)
        p_len =4*hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)


    "form input to LR classifier"
    LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    LR_input_size=p_len+1+1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.


    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()





    params = NN_para+LR_para #[init_embeddings]


    # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)
    load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params)

    '''
    0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424
    0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538
    '''

    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    if n_test_remain!=0:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)



    # max_acc_dev=0.0
    max_ap_test=0.0
    max_ap_topk_test=0.0
    max_f1=0.0


    pred_labels =[]
    probs = []
    gold_labels =[]
    error_sum=0.0
    for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])

        # error_sum+=error_i
        pred_labels+=list(pred_i)
        probs+=list(prob_i)

    print len(test_sents_l), len(probs)
    if n_test_remain !=0:
        probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:]
    print len(test_sents_l), len(probs)
    assert len(test_sents_l) == len(probs)
    assert sum(group_size_list) == len(probs)
    #max prob in group
    max_probs = []
    prior_size = 0
    for i in range(len(group_size_list)):

        sub_probs = probs[prior_size:prior_size+group_size_list[i]]
        prior_size += group_size_list[i]
        max_probs.append(max(sub_probs))

    print len(group_size_list),len(max_probs),len(test_labels)
    assert len(test_labels) == len(max_probs)
    # test_acc=1.0-error_sum/(len(test_batch_start))
    test_ap = apk(test_labels, max_probs, k=len(test_labels))
    test_ap_top100 = apk(test_labels, max_probs, k=100)


    # if test_ap > max_ap_test:
    #     max_ap_test=test_ap
    #     store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params)
    # if test_ap_top100 > max_ap_topk_test:
    #     max_ap_topk_test=test_ap_top100
    print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
コード例 #2
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=10,
                    filter_size=[3, 5],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2], axis=1)
    LR_input_size = hidden_size[0] * 2
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))

    # loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [embeddings
              ] + NN_para + LR_para  # put all model parameters together
    cost = loss  #+Div_reg*diversify_reg#+L2_weight*L2_reg
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''
    binarize_prob = T.where(score_matrix > 0.5, 1, 0)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask],
                                 binarize_prob,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        iter_accu = 0
        cost_i = 0.0
        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]

                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
コード例 #3
0
                for word in sentence_wordlist_r:
                    if word in vocab or word in glove_vocab:
                        norm_r.append(word)
                    else:
                        if len(word)>0:
                            expected_match = rareword2match.get(word)
                            if expected_match is None:
                                expected_match = find_top_similar_vocab(word, l_vocab, overlap_vocab)
                                
                                rareword2match[word] = expected_match
                            norm_r.append(expected_match)
                writefile.write(' '.join(norm_l)+'\t'+' '.join(norm_r)+'\n')
#             line_co+=1
#             if line_co%1==0:
#                 print line_co,' ...'
        writefile.close()
        print 'file norm over'
        readfile.close()



if __name__ == '__main__':
    word2vec=load_word2vec()
    vocab=set(word2vec.keys())
    glove2vec=load_word2vec_file('glove.6B.300d.txt')
    glove_vocab=set(glove2vec.keys())
    overlap_vocab = vocab&glove_vocab
    handle_file(['test', 'train','dev'], vocab,glove_vocab, overlap_vocab)
#     lit='we are your-s   most famous-start . '
#     print re.split('\s+|-',lit)
コード例 #4
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    drop_p=0.0,
                    div_weight=0.00001,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    comment='only one HL layer, drop0.0'):

    model_options = locals().copy()
    print "model options", model_options

    first_seeds = [1234, 1235, 1236, 1237]  #first copy starts by 1
    first_rngs = [
        np.random.RandomState(first_seeds[0]),
        np.random.RandomState(first_seeds[1]),
        np.random.RandomState(first_seeds[2]),
        np.random.RandomState(first_seeds[3])
    ]  #random seed, control the model generates the same results
    first_srng = RandomStreams(first_rngs[0].randint(999999))

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_extra, all_labels, word2id, test_rows = load_SNLI_dataset_with_extra_with_test(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    #     train_sents_l = np.concatenate((train_sents_l, dev_sents_l), axis=0)
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    #     train_masks_l = np.concatenate((train_masks_l, dev_masks_l), axis=0)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    #     train_sents_r = np.concatenate((train_sents_r, dev_sents_r), axis=0)
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    #     train_masks_r = np.concatenate((train_masks_r, dev_masks_r), axis=0)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_extra = np.asarray(all_extra[0], dtype=theano.config.floatX)
    dev_extra = np.asarray(all_extra[1], dtype=theano.config.floatX)
    test_extra = np.asarray(all_extra[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    #     train_labels_store = np.concatenate((train_labels_store, dev_labels_store), axis=0)
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = first_rngs[0].normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    #     word2vec =extend_word2vec_lowercase(word2vec)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    first_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    train_flag = T.iscalar()
    first_sents_ids_l = T.imatrix()
    first_sents_mask_l = T.fmatrix()
    first_sents_ids_r = T.imatrix()
    first_sents_mask_r = T.fmatrix()
    first_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def common_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    first_common_input_l = dropout_layer(
        first_srng, common_input(first_embeddings,
                                 first_sents_ids_l), drop_p, train_flag
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    first_common_input_r = dropout_layer(
        first_srng, common_input(first_embeddings,
                                 first_sents_ids_r), drop_p, train_flag
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    gate_filter_shape = (hidden_size[0], 1, emb_size, 1)

    def create_CNN_params(rng):
        conv_W_2_pre, conv_b_2_pre = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2_gate, conv_b_2_gate = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2, conv_b_2 = create_conv_para(rng,
                                              filter_shape=(hidden_size[1], 1,
                                                            hidden_size[0],
                                                            filter_size[0]))
        conv_W_2_context, conv_b_2_context = create_conv_para(
            rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
        return conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate, conv_W_2, conv_b_2, conv_W_2_context, conv_b_2_context

    first_conv_W_pre, first_conv_b_pre, first_conv_W_gate, first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context, first_conv_b_context = create_CNN_params(
        first_rngs[0])
    first_2_conv_W_pre, first_2_conv_b_pre, first_2_conv_W_gate, first_2_conv_b_gate, first_2_conv_W, first_2_conv_b, first_2_conv_W_context, first_2_conv_b_context = create_CNN_params(
        first_rngs[1])
    first_3_conv_W_pre, first_3_conv_b_pre, first_3_conv_W_gate, first_3_conv_b_gate, first_3_conv_W, first_3_conv_b, first_3_conv_W_context, first_3_conv_b_context = create_CNN_params(
        first_rngs[2])
    first_4_conv_W_pre, first_4_conv_b_pre, first_4_conv_W_gate, first_4_conv_b_gate, first_4_conv_W, first_4_conv_b, first_4_conv_W_context, first_4_conv_b_context = create_CNN_params(
        first_rngs[3])
    # first_5_conv_W_pre, first_5_conv_b_pre,first_5_conv_W_gate, first_5_conv_b_gate,first_5_conv_W, first_5_conv_b,first_5_conv_W_context, first_5_conv_b_context = create_CNN_params(first_rng)
    # first_6_conv_W_pre, first_6_conv_b_pre,first_6_conv_W_gate, first_6_conv_b_gate,first_6_conv_W, first_6_conv_b,first_6_conv_W_context, first_6_conv_b_context = create_CNN_params(first_rng)

    first_CNN_1_para = [
        first_conv_W_pre, first_conv_b_pre, first_conv_W_gate,
        first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context
    ]
    first_CNN_2_para = [
        first_2_conv_W_pre, first_2_conv_b_pre, first_2_conv_W_gate,
        first_2_conv_b_gate, first_2_conv_W, first_2_conv_b,
        first_2_conv_W_context
    ]
    first_CNN_3_para = [
        first_3_conv_W_pre, first_3_conv_b_pre, first_3_conv_W_gate,
        first_3_conv_b_gate, first_3_conv_W, first_3_conv_b,
        first_3_conv_W_context
    ]
    first_CNN_4_para = [
        first_4_conv_W_pre, first_4_conv_b_pre, first_4_conv_W_gate,
        first_4_conv_b_gate, first_4_conv_W, first_4_conv_b,
        first_4_conv_W_context
    ]
    '''
    first copy
    '''

    def copy(
        rngs,
        common_input_l,
        common_input_r,
        sents_mask_l,
        sents_mask_r,
        drop_conv_W_1_pre,
        conv_b_1_pre,
        drop_conv_W_1_gate,
        conv_b_1_gate,
        drop_conv_W_1,
        conv_b_1,
        drop_conv_W_1_context,
        conv_b_1_context,
        drop_conv_W_1_pre_2,
        conv_b_1_pre_2,
        drop_conv_W_1_gate_2,
        conv_b_1_gate_2,
        drop_conv_W_1_2,
        conv_b_1_2,
        drop_conv_W_1_context_2,
        conv_b_1_context_2,
        drop_conv_W_1_pre_3,
        conv_b_1_pre_3,
        drop_conv_W_1_gate_3,
        conv_b_1_gate_3,
        drop_conv_W_1_3,
        conv_b_1_3,
        drop_conv_W_1_context_3,
        conv_b_1_context_3,
        drop_conv_W_1_pre_4,
        conv_b_1_pre_4,
        drop_conv_W_1_gate_4,
        conv_b_1_gate_4,
        drop_conv_W_1_4,
        conv_b_1_4,
        drop_conv_W_1_context_4,
        conv_b_1_context_4,

        #    drop_conv_W_1_pre_5,conv_b_1_pre_5,drop_conv_W_1_gate_5,conv_b_1_gate_5,
        #    drop_conv_W_1_5,conv_b_1_5,drop_conv_W_1_context_5, conv_b_1_context_5,
        #
        #    drop_conv_W_1_pre_6,conv_b_1_pre_6,drop_conv_W_1_gate_6,conv_b_1_gate_6,
        #    drop_conv_W_1_6,conv_b_1_6,drop_conv_W_1_context_6, conv_b_1_context_6,
        labels):

        loss_0_0, distr_0_0, params_0_0 = one_classifier_in_one_copy(
            rngs[0], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre, conv_b_1_pre, drop_conv_W_1_gate, conv_b_1_gate,
            drop_conv_W_1, conv_b_1, drop_conv_W_1_context, conv_b_1_context,
            True)
        loss_0_1, distr_0_1, params_0_1 = one_classifier_in_one_copy(
            rngs[1], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre_2, conv_b_1_pre_2, drop_conv_W_1_gate_2,
            conv_b_1_gate_2, drop_conv_W_1_2, conv_b_1_2,
            drop_conv_W_1_context_2, conv_b_1_context_2, False)
        loss_0_2, distr_0_2, params_0_2 = one_classifier_in_one_copy(
            rngs[2], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre_3, conv_b_1_pre_3, drop_conv_W_1_gate_3,
            conv_b_1_gate_3, drop_conv_W_1_3, conv_b_1_3,
            drop_conv_W_1_context_3, conv_b_1_context_3, True)
        loss_0_3, distr_0_3, params_0_3 = one_classifier_in_one_copy(
            rngs[3], common_input_l, common_input_r, sents_mask_l,
            sents_mask_r, batch_size, emb_size, maxSentLen, gate_filter_shape,
            hidden_size, filter_size, first_srng, drop_p, train_flag, labels,
            drop_conv_W_1_pre_4, conv_b_1_pre_4, drop_conv_W_1_gate_4,
            conv_b_1_gate_4, drop_conv_W_1_4, conv_b_1_4,
            drop_conv_W_1_context_4, conv_b_1_context_4, False)
        # loss_0_4, distr_0_4, params_0_4 = one_classifier_in_one_copy(rng, common_input_l,common_input_r,sents_mask_l,sents_mask_r,batch_size, emb_size,
        #                                                              maxSentLen,gate_filter_shape,hidden_size,filter_size,
        #                        first_srng, drop_p,train_flag,labels,
        #                        drop_conv_W_1_pre_5,conv_b_1_pre_5,drop_conv_W_1_gate_5,conv_b_1_gate_5,
        #                        drop_conv_W_1_5,conv_b_1_5,drop_conv_W_1_context_5,conv_b_1_context_5,
        #                        True)
        # loss_0_5, distr_0_5, params_0_5 = one_classifier_in_one_copy(rng, common_input_l,common_input_r,sents_mask_l,sents_mask_r,batch_size, emb_size,
        #                                                              maxSentLen,gate_filter_shape,hidden_size,filter_size,
        #                        first_srng, drop_p,train_flag,labels,
        #                        drop_conv_W_1_pre_6,conv_b_1_pre_6,drop_conv_W_1_gate_6,conv_b_1_gate_6,
        #                        drop_conv_W_1_6,conv_b_1_6,drop_conv_W_1_context_6,conv_b_1_context_6,
        #                        False)

        #         psp_label = T.repeat(labels, multi_psp_size)

        loss_0 = [(loss_0_0 + loss_0_1 + loss_0_2 + loss_0_3) / 4.0, loss_0_0,
                  loss_0_1, loss_0_2, loss_0_3]  #5
        para_0 = [params_0_0, params_0_1, params_0_2, params_0_3]

        #         loss = loss_0+loss_1+loss_2
        batch_distr = distr_0_0 + distr_0_1 + distr_0_2 + distr_0_3  #T.sum((layer_LR.p_y_given_x).reshape((batch_size, multi_psp_size,3)), axis=1)  #(batch, 3)

        return loss_0, para_0, batch_distr

    first_loss, first_classifier_params, first_test_distr = copy(
        first_rngs,
        first_common_input_l,
        first_common_input_r,
        first_sents_mask_l,
        first_sents_mask_r,
        first_conv_W_pre,
        first_conv_b_pre,
        first_conv_W_gate,
        first_conv_b_gate,
        first_conv_W,
        first_conv_b,
        first_conv_W_context,
        first_conv_b_context,
        first_2_conv_W_pre,
        first_2_conv_b_pre,
        first_2_conv_W_gate,
        first_2_conv_b_gate,
        first_2_conv_W,
        first_2_conv_b,
        first_2_conv_W_context,
        first_2_conv_b_context,
        first_3_conv_W_pre,
        first_3_conv_b_pre,
        first_3_conv_W_gate,
        first_3_conv_b_gate,
        first_3_conv_W,
        first_3_conv_b,
        first_3_conv_W_context,
        first_3_conv_b_context,
        first_4_conv_W_pre,
        first_4_conv_b_pre,
        first_4_conv_W_gate,
        first_4_conv_b_gate,
        first_4_conv_W,
        first_4_conv_b,
        first_4_conv_W_context,
        first_4_conv_b_context,

        # drop_first_5_conv_W_pre,first_5_conv_b_pre,drop_first_5_conv_W_gate,first_5_conv_b_gate,
        # drop_first_5_conv_W,first_5_conv_b,drop_first_5_conv_W_context,first_5_conv_b_context,
        #
        # drop_first_6_conv_W_pre,first_6_conv_b_pre,drop_first_6_conv_W_gate,first_6_conv_b_gate,
        # drop_first_6_conv_W,first_6_conv_b,drop_first_6_conv_W_context,first_6_conv_b_context,
        first_labels)

    first_preds = T.argmax(first_test_distr, axis=1)  #batch
    all_error = T.mean(T.neq(first_preds, first_labels))

    #     neg_labels = T.where( labels < 2, 2, labels-1)
    #     loss2=-T.mean(T.log(1.0/(1.0+layer_LR.p_y_given_x))[T.arange(neg_labels.shape[0]), neg_labels])

    # rank loss
    # entail_prob_batch = T.nnet.softmax(layer_LR.before_softmax.T)[2] #batch
    # entail_ids = elementwise_is_two(labels)
    # entail_probs = entail_prob_batch[entail_ids.nonzero()]
    # non_entail_probs = entail_prob_batch[(1-entail_ids).nonzero()]
    #
    # repeat_entail = T.extra_ops.repeat(entail_probs, non_entail_probs.shape[0], axis=0)
    # repeat_non_entail = T.extra_ops.repeat(non_entail_probs.dimshuffle('x',0), entail_probs.shape[0], axis=0).flatten()
    # loss2 = -T.mean(T.log(entail_probs))#T.mean(T.maximum(0.0, margin-repeat_entail+repeat_non_entail))

    # zero_matrix = T.zeros((batch_size, 3))
    # filled_zero_matrix = T.set_subtensor(zero_matrix[T.arange(batch_size), labels], 1.0)
    # prob_batch_posi = layer_LR.p_y_given_x[filled_zero_matrix.nonzero()]
    # prob_batch_nega = layer_LR.p_y_given_x[(1-filled_zero_matrix).nonzero()]
    #
    # repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    # repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    # loss2 = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    first_common_para = [first_embeddings]
    first_classifier_1_para = first_CNN_1_para + first_classifier_params[0]
    first_classifier_2_para = first_CNN_2_para + first_classifier_params[1]
    first_classifier_3_para = first_CNN_3_para + first_classifier_params[2]
    first_classifier_4_para = first_CNN_4_para + first_classifier_params[3]

    first_common_updates = Gradient_Cost_Para(first_loss[0], first_common_para,
                                              learning_rate)
    first_classifier_1_updates = Gradient_Cost_Para(first_loss[1],
                                                    first_classifier_1_para,
                                                    learning_rate)
    first_classifier_2_updates = Gradient_Cost_Para(first_loss[2],
                                                    first_classifier_2_para,
                                                    learning_rate)
    first_classifier_3_updates = Gradient_Cost_Para(first_loss[3],
                                                    first_classifier_3_para,
                                                    learning_rate)
    first_classifier_4_updates = Gradient_Cost_Para(first_loss[4],
                                                    first_classifier_4_para,
                                                    learning_rate)

    cost = first_loss[0]

    first_updates = first_common_updates + first_classifier_1_updates + first_classifier_2_updates + first_classifier_3_updates + first_classifier_4_updates

    updates = first_updates

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([
        train_flag,
        first_sents_ids_l,
        first_sents_mask_l,
        first_sents_ids_r,
        first_sents_mask_r,
        first_labels,
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    #     train_model_pred = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra,labels,
    #                                         second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], [LR_input, labels], allow_input_downcast=True, on_unused_input='ignore')
    #
    #     dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra, labels,
    #                                  second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        train_flag,
        first_sents_ids_l,
        first_sents_mask_l,
        first_sents_ids_r,
        first_sents_mask_r,
        first_labels,
    ], [all_error, first_preds],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    gold_test_rows = test_rows[:(n_test_batches *
                                 batch_size)] + test_rows[-batch_size:]

    max_acc_test = 0.0

    cost_i = 0.0
    first_train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(200).shuffle(
            first_train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            first_train_id_batch = first_train_indices[batch_id:batch_id +
                                                       batch_size]
            cost_i += train_model(1, train_sents_l[first_train_id_batch],
                                  train_masks_l[first_train_id_batch],
                                  train_sents_r[first_train_id_batch],
                                  train_masks_r[first_train_id_batch],
                                  train_labels_store[first_train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % int(2000 * (50.0 / batch_size)) == 0:
                #             if iter%int(200*(50.0 / batch_size))==0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
                #     print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                #     past_time = time.time()

                #                 error_sum=0.0
                #                 for dev_batch_id in dev_batch_start: # for each test batch
                #                     error_i=dev_model(
                #                                 dev_sents_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_sents_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                #                                 )
                #
                #                     error_sum+=error_i
                #                 dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                #                 if dev_accuracy > max_acc_dev:
                #                     max_acc_dev=dev_accuracy
                #                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                #best dev model, do test
                #                 error_sum_1=0.0
                #                 error_sum_2=0.0
                error_sum_comb = 0.0
                pred_ys = []
                for test_batch_id in test_batch_start:  # for each test batch
                    error_comb, pred_ys_batch = test_model(
                        0,
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    #                     error_sum_1+=error_1
                    #                     error_sum_2+=error_2
                    error_sum_comb += error_comb
                    pred_ys += list(pred_ys_batch)

#                 test_acc_1=1.0-error_sum_1/(len(test_batch_start))
#                 test_acc_2=1.0-error_sum_2/(len(test_batch_start))
                test_acc_comb = 1.0 - error_sum_comb / (len(test_batch_start))

                #                 if test_acc_1 > max_acc_test:
                #                     max_acc_test=test_acc_1
                #                 if test_acc_2 > max_acc_test:
                #                     max_acc_test=test_acc_2
                if test_acc_comb > max_acc_test:
                    max_acc_test = test_acc_comb
                    #                     store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_single_model_'+str(max_acc_test), params)

                    if len(pred_ys) != len(gold_test_rows):
                        print 'len(pred_ys)!=len(gold_test_rows):', len(
                            pred_ys), len(gold_test_rows)
                    else:
                        test_write = open(
                            '/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/error_analysis_'
                            + str(max_acc_test) + '.txt', 'w')
                        for i in range(len(pred_ys)):
                            test_write.write(
                                str(pred_ys[i]) + '\t' + gold_test_rows[i] +
                                '\n')
                        print 'error analysis file written over.'
                        test_write.close()

                print '\t\tcurrent acc:', test_acc_comb, '\t\t\t\t\tmax_acc:', max_acc_test


#                 else:
#                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
コード例 #5
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=1e-5,
                    extra_size=4,
                    emb_size=300,
                    batch_size=100,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    max_term_len=4,
                    p_mode='conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words(
        maxlen=maxSentLen, wordlen=max_term_len
    )  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id = load_task_hyper_vs_all_with_words(
        LenciBenotto_file, maxSentLen, word2id, wordlen=max_term_len)
    store_word2id(word2id, root_dic + 'LenciBenotto_word2id.pkl')
    # exit(0)
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_word1 = np.asarray(all_word1[:train_size], dtype='int32')
    train_word2 = np.asarray(all_word2[:train_size], dtype='int32')
    test_word1 = np.asarray(test_word1, dtype='int32')
    test_word2 = np.asarray(test_word2, dtype='int32')

    train_word1_mask = np.asarray(all_word1_mask[:train_size],
                                  dtype=theano.config.floatX)
    train_word2_mask = np.asarray(all_word2_mask[:train_size],
                                  dtype=theano.config.floatX)
    test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_extra = np.asarray(all_extra[:train_size],
                             dtype=theano.config.floatX)
    test_extra = np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable
    store_model_to_file(root_dic + 'LenciBenotto_best_para_init_embeddings',
                        [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape(
        (batch_size, word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape(
        (batch_size, word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=word1_mask,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([
        tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r,
        tt_embeddings_l - tt_embeddings_r
    ],
                         axis=1)

    term_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_input_r,
        mask_matrix=word1_mask,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([
        td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r,
        td_embeddings_l - td_embeddings_r
    ],
                         axis=1)

    def_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=sents_mask_l,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([
        dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r,
        dt_embeddings_l - dt_embeddings_r
    ],
                         axis=1)

    def_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([
        dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r,
        dd_embeddings_l - dd_embeddings_r
    ],
                         axis=1)

    if p_mode == 'conc':
        p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4 * 4 * hidden_size[1]
    else:
        p = T.max(T.concatenate([
            p_ww.dimshuffle('x', 0, 1),
            p_wd.dimshuffle('x', 0, 1),
            p_dw.dimshuffle('x', 0, 1),
            p_dd.dimshuffle('x', 0, 1)
        ],
                                axis=0),
                  axis=0)
        p_len = 4 * hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)

    "form input to LR classifier"
    LR_input = T.concatenate([
        p,
        cosine_matrix1_matrix2_rowwise(word1_embedding,
                                       word2_embedding).dimshuffle(0, 'x'),
        extra.dimshuffle(0, 'x')
    ],
                             axis=1)
    LR_input_size = p_len + 1 + 1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=LR_input,
        n_in=LR_input_size,
        n_out=2,
        W=U_a,
        b=LR_b,
        bias=0.25
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    params = NN_para + LR_para  #[init_embeddings]

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                train_word1[train_id_batch], train_word2[train_id_batch],
                train_word1_mask[train_id_batch],
                train_word2_mask[train_id_batch], train_extra[train_id_batch],
                train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_word1[test_batch_id:test_batch_id + batch_size],
                        test_word2[test_batch_id:test_batch_id + batch_size],
                        test_word1_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_word2_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_extra[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                if n_test_remain != 0:
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)

                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                    store_model_to_file(
                        root_dic + 'LenciBenotto_best_para_' +
                        str(max_ap_test), params)
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
コード例 #6
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=10,
                    L2_weight=0.000001,
                    extra_size=4,
                    emb_size=300,
                    posi_emb_size=50,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=50,
                    hidden_size=300):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SciTailV1_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    posi_rand_values = rng.normal(
        0.0, 0.01,
        (maxSentLen,
         posi_emb_size))  #generate a matrix by Gaussian distribution
    posi_embeddings = theano.shared(
        value=np.array(posi_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size, 1, emb_size,
                                                    filter_size[0]))
    conv_W_posi, conv_b_posi = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + posi_emb_size,
                      filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size, 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_posi, conv_b_posi, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size, 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size, 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_posi=conv_W_posi,
        b_posi=conv_b_posi,
        W_context=conv_W_context,
        b_context=conv_b_context,
        posi_emb_matrix=posi_embeddings,
        posi_emb_size=posi_emb_size)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    sent_embeddings_l = attentive_conv_layer.maxpool_vec_l
    sent_embeddings_r = attentive_conv_layer.maxpool_vec_r

    "form input to LR classifier"
    LR_input = T.concatenate([
        sent_embeddings_l, sent_embeddings_r,
        sent_embeddings_l * sent_embeddings_r, attentive_sent_embeddings_l,
        attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = 6 * hidden_size

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [init_embeddings, posi_embeddings] + NN_para + LR_para
    # L2_reg = (init_embeddings**2).sum()+(conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

                    error_sum = 0.0
                    for idd, test_batch_id in enumerate(
                            test_batch_start):  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))
                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                        store_model_to_file(
                            '/home/wenpeng/workspace/SciTail/src/model_para_' +
                            str(max_acc_test), params)
                    print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
コード例 #7
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 5],
                    maxSentLen=60,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id = load_NYT_dataset(
        maxSentLen, word2id)

    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    hidden_size[0],
                                                    filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size[1], 1,
                                                        hidden_size[0],
                                                        filter_size[1]))
    conv_W_context_2, conv_b_context_2 = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_2, conv_b_2, conv_W_context_2
    ]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer_2 = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
        image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
        filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[1]),
        filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
        W=conv_W_2,
        b=conv_b_2,
        W_context=conv_W_context_2,
        b_context=conv_b_context_2)

    attentive_sent_embeddings_l_2 = attentive_conv_layer_2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r_2 = attentive_conv_layer_2.attentive_maxpool_vec_r

    #form input to HL layers
    HL_layer_1_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r,
        attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2,
        attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2
    ],
                                     axis=1)

    HL_layer_1_input_size = 6 * hidden_size[1]

    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             activation=T.nnet.relu)
    HL_layer_2 = HiddenLayer(rng,
                             input=HL_layer_1.output,
                             n_in=hidden_size[1],
                             n_out=hidden_size[1],
                             activation=T.nnet.relu)

    # LR_input_size=HL_layer_1_input_size+2*hidden_size[0]
    "form input to LR classifier"
    LR_input = T.tanh(
        T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],
                      axis=1))
    LR_input_size = HL_layer_1_input_size + 2 * hidden_size[1]

    U_a = create_ensemble_para(
        rng, 3, LR_input_size)  # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    '''
    testing
    '''

    test_preds = T.argmax(layer_LR.p_y_given_x, axis=1)
    transfered_preds = T.eq(test_preds, 2)
    test_error = T.mean(T.neq(transfered_preds, labels))

    params = [init_embeddings
              ] + NN_para + HL_layer_1.params + HL_layer_2.params + LR_para

    cost = loss

    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        [test_error, transfered_preds],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_labels_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_labels_i)
                    gold_labels += list(
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])
                test_acc = 1.0 - error_sum / (len(test_batch_start))
                test_f1 = f1_score_2_binary_list(
                    gold_labels, pred_labels)  #, average='binary')

                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                if test_f1 > max_f1:
                    max_f1 = test_f1

                    # store_model_to_file('/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'+str(max_acc_test), params)

                print '\t\tcurrent acc:', test_acc, ' ; ', '\t\tmax_acc:', max_acc_test, '\t\t test_f1:', test_f1, '\t\tmax F1:', max_f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
コード例 #8
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3],
                    maxSentLen=40,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in'
    'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) '
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    "Logistic Regression layer"
    LR_input = T.concatenate([
        attentive_sent_embeddings_l, attentive_sent_embeddings_r,
        attentive_sent_embeddings_l + attentive_sent_embeddings_r,
        attentive_sent_embeddings_l * attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = 4 * hidden_size[0]

    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [init_embeddings] + NN_para + LR_para
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 1000 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                '''
                test
                '''
                error_sum = 0.0
                for test_batch_id in test_batch_start:  # for each test batch
                    error_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                test_acc = 1.0 - error_sum / (len(test_batch_start))

                if test_acc > max_acc_test:
                    max_acc_test = test_acc
                print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
コード例 #9
0
def evaluate_lenet5(learning_rate=0.005,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9il10-eng/il9il10-eng-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9il10-eng/il9il10-eng_system_output_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(tyope_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 5)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 3)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after each 1000 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output_english(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
コード例 #10
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=0.0000001,
                    extra_size=4,
                    use_svm=False,
                    drop_p=0.1,
                    div_weight=0.00001,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    margin=0.1,
                    comment='five copies, sum&majority'):

    model_options = locals().copy()
    print "model options", model_options

    first_seed = 1234
    np.random.seed(first_seed)
    first_rng = np.random.RandomState(
        first_seed)  #random seed, control the model generates the same results
    first_srng = RandomStreams(first_rng.randint(999999))

    second_seed = 2345
    np.random.seed(second_seed)
    second_rng = np.random.RandomState(
        second_seed
    )  #random seed, control the model generates the same results
    second_srng = RandomStreams(second_rng.randint(888888))

    third_seed = 3456
    np.random.seed(third_seed)
    third_rng = np.random.RandomState(
        third_seed)  #random seed, control the model generates the same results
    third_srng = RandomStreams(third_rng.randint(777777))

    fourth_seed = 4567
    np.random.seed(fourth_seed)
    fourth_rng = np.random.RandomState(
        fourth_seed
    )  #random seed, control the model generates the same results
    fourth_srng = RandomStreams(fourth_rng.randint(666666))

    fifth_seed = 5678
    np.random.seed(fifth_seed)
    fifth_rng = np.random.RandomState(
        fifth_seed)  #random seed, control the model generates the same results
    fifth_srng = RandomStreams(fifth_rng.randint(555555))

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_extra, all_labels, word2id = load_SNLI_dataset_with_extra(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    #     train_sents_l = np.concatenate((train_sents_l, dev_sents_l), axis=0)
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    #     train_masks_l = np.concatenate((train_masks_l, dev_masks_l), axis=0)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    #     train_sents_r = np.concatenate((train_sents_r, dev_sents_r), axis=0)
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    #     train_masks_r = np.concatenate((train_masks_r, dev_masks_r), axis=0)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_extra = np.asarray(all_extra[0], dtype=theano.config.floatX)
    dev_extra = np.asarray(all_extra[1], dtype=theano.config.floatX)
    test_extra = np.asarray(all_extra[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    #     train_labels_store = np.concatenate((train_labels_store, dev_labels_store), axis=0)
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = first_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    #     word2vec =extend_word2vec_lowercase(word2vec)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    #     normed_matrix = normalize(rand_values, axis=0, norm='l2')
    first_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    second_rand_values = second_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    second_rand_values[0] = np.array(np.zeros(emb_size),
                                     dtype=theano.config.floatX)
    second_rand_values = load_word2vec_to_init(second_rand_values, id2word,
                                               word2vec)
    second_embeddings = theano.shared(
        value=np.array(second_rand_values, dtype=theano.config.floatX),
        borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    third_rand_values = third_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    third_rand_values[0] = np.array(np.zeros(emb_size),
                                    dtype=theano.config.floatX)
    third_rand_values = load_word2vec_to_init(third_rand_values, id2word,
                                              word2vec)
    third_embeddings = theano.shared(value=np.array(
        third_rand_values, dtype=theano.config.floatX),
                                     borrow=True)

    fourth_rand_values = fourth_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    fourth_rand_values[0] = np.array(np.zeros(emb_size),
                                     dtype=theano.config.floatX)
    fourth_rand_values = load_word2vec_to_init(fourth_rand_values, id2word,
                                               word2vec)
    fourth_embeddings = theano.shared(value=np.array(
        fourth_rand_values, dtype=theano.config.floatX),
                                      borrow=True)

    fifth_rand_values = fifth_rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    fifth_rand_values[0] = np.array(np.zeros(emb_size),
                                    dtype=theano.config.floatX)
    fifth_rand_values = load_word2vec_to_init(fifth_rand_values, id2word,
                                              word2vec)
    fifth_embeddings = theano.shared(value=np.array(
        fifth_rand_values, dtype=theano.config.floatX),
                                     borrow=True)

    #now, start to build the input form of the model
    train_flag = T.iscalar()
    first_sents_ids_l = T.imatrix()
    first_sents_mask_l = T.fmatrix()
    first_sents_ids_r = T.imatrix()
    first_sents_mask_r = T.fmatrix()
    first_labels = T.ivector()
    second_sents_ids_l = T.imatrix()
    second_sents_mask_l = T.fmatrix()
    second_sents_ids_r = T.imatrix()
    second_sents_mask_r = T.fmatrix()
    second_labels = T.ivector()
    third_sents_ids_l = T.imatrix()
    third_sents_mask_l = T.fmatrix()
    third_sents_ids_r = T.imatrix()
    third_sents_mask_r = T.fmatrix()
    third_labels = T.ivector()
    fourth_sents_ids_l = T.imatrix()
    fourth_sents_mask_l = T.fmatrix()
    fourth_sents_ids_r = T.imatrix()
    fourth_sents_mask_r = T.fmatrix()
    fourth_labels = T.ivector()
    fifth_sents_ids_l = T.imatrix()
    fifth_sents_mask_l = T.fmatrix()
    fifth_sents_ids_r = T.imatrix()
    fifth_sents_mask_r = T.fmatrix()
    fifth_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def common_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    first_common_input_l = common_input(
        first_embeddings, first_sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    first_common_input_r = common_input(
        first_embeddings, first_sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    second_common_input_l = common_input(
        second_embeddings, second_sents_ids_l
    )  #second_embeddings[second_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    second_common_input_r = common_input(
        second_embeddings, second_sents_ids_r
    )  #second_embeddings[second_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    third_common_input_l = common_input(
        third_embeddings, third_sents_ids_l
    )  #third_embeddings[third_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    third_common_input_r = common_input(
        third_embeddings, third_sents_ids_r
    )  #third_embeddings[third_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    fourth_common_input_l = common_input(
        fourth_embeddings, fourth_sents_ids_l
    )  #fourth_embeddings[fourth_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    fourth_common_input_r = common_input(
        fourth_embeddings, fourth_sents_ids_r
    )  #fourth_embeddings[fourth_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    fifth_common_input_l = common_input(
        fifth_embeddings, fifth_sents_ids_l
    )  #fifth_embeddings[fifth_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    fifth_common_input_r = common_input(
        fifth_embeddings, fifth_sents_ids_r
    )  #fifth_embeddings[fifth_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    gate_filter_shape = (hidden_size[0], 1, emb_size, 1)

    def create_CNN_params(rng):
        conv_W_2_pre, conv_b_2_pre = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2_gate, conv_b_2_gate = create_conv_para(
            rng, filter_shape=gate_filter_shape)
        conv_W_2, conv_b_2 = create_conv_para(rng,
                                              filter_shape=(hidden_size[1], 1,
                                                            hidden_size[0],
                                                            filter_size[0]))
        conv_W_2_context, conv_b_2_context = create_conv_para(
            rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
        return conv_W_2_pre, conv_b_2_pre, conv_W_2_gate, conv_b_2_gate, conv_W_2, conv_b_2, conv_W_2_context, conv_b_2_context

    first_conv_W_pre, first_conv_b_pre, first_conv_W_gate, first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context, first_conv_b_context = create_CNN_params(
        first_rng)
    second_conv_W_pre, second_conv_b_pre, second_conv_W_gate, second_conv_b_gate, second_conv_W, second_conv_b, second_conv_W_context, second_conv_b_context = create_CNN_params(
        second_rng)
    third_conv_W_pre, third_conv_b_pre, third_conv_W_gate, third_conv_b_gate, third_conv_W, third_conv_b, third_conv_W_context, third_conv_b_context = create_CNN_params(
        third_rng)
    fourth_conv_W_pre, fourth_conv_b_pre, fourth_conv_W_gate, fourth_conv_b_gate, fourth_conv_W, fourth_conv_b, fourth_conv_W_context, fourth_conv_b_context = create_CNN_params(
        fourth_rng)
    fifth_conv_W_pre, fifth_conv_b_pre, fifth_conv_W_gate, fifth_conv_b_gate, fifth_conv_W, fifth_conv_b, fifth_conv_W_context, fifth_conv_b_context = create_CNN_params(
        fifth_rng)
    '''
    dropout paras
    '''

    def dropout_group(rng, conv_W_2_pre, conv_W_2_gate, conv_W_2,
                      conv_W_2_context):
        drop_conv_W_2_pre = dropout_layer(rng, conv_W_2_pre, drop_p,
                                          train_flag)
        drop_conv_W_2_gate = dropout_layer(rng, conv_W_2_gate, drop_p,
                                           train_flag)
        drop_conv_W_2 = dropout_layer(rng, conv_W_2, drop_p, train_flag)
        drop_conv_W_2_context = dropout_layer(rng, conv_W_2_context, drop_p,
                                              train_flag)
        return drop_conv_W_2_pre, drop_conv_W_2_gate, drop_conv_W_2, drop_conv_W_2_context

    drop_first_conv_W_pre, drop_first_conv_W_gate, drop_first_conv_W, drop_first_conv_W_context = dropout_group(
        first_srng, first_conv_W_pre, first_conv_W_gate, first_conv_W,
        first_conv_W_context)
    drop_second_conv_W_pre, drop_second_conv_W_gate, drop_second_conv_W, drop_second_conv_W_context = dropout_group(
        second_srng, second_conv_W_pre, second_conv_W_gate, second_conv_W,
        second_conv_W_context)
    drop_third_conv_W_pre, drop_third_conv_W_gate, drop_third_conv_W, drop_third_conv_W_context = dropout_group(
        third_srng, third_conv_W_pre, third_conv_W_gate, third_conv_W,
        third_conv_W_context)
    drop_fourth_conv_W_pre, drop_fourth_conv_W_gate, drop_fourth_conv_W, drop_fourth_conv_W_context = dropout_group(
        fourth_srng, fourth_conv_W_pre, fourth_conv_W_gate, fourth_conv_W,
        fourth_conv_W_context)
    drop_fifth_conv_W_pre, drop_fifth_conv_W_gate, drop_fifth_conv_W, drop_fifth_conv_W_context = dropout_group(
        fifth_srng, fifth_conv_W_pre, fifth_conv_W_gate, fifth_conv_W,
        fifth_conv_W_context)

    first_NN_para = [  #conv_W, conv_b,
        first_conv_W_pre, first_conv_b_pre, first_conv_W_gate,
        first_conv_b_gate, first_conv_W, first_conv_b, first_conv_W_context
    ]
    second_NN_para = [
        second_conv_W_pre, second_conv_b_pre, second_conv_W_gate,
        second_conv_b_gate, second_conv_W, second_conv_b, second_conv_W_context
    ]

    third_NN_para = [
        third_conv_W_pre, third_conv_b_pre, third_conv_W_gate,
        third_conv_b_gate, third_conv_W, third_conv_b, third_conv_W_context
    ]
    fourth_NN_para = [
        fourth_conv_W_pre, fourth_conv_b_pre, fourth_conv_W_gate,
        fourth_conv_b_gate, fourth_conv_W, fourth_conv_b, fourth_conv_W_context
    ]

    fifth_NN_para = [
        fifth_conv_W_pre, fifth_conv_b_pre, fifth_conv_W_gate,
        fifth_conv_b_gate, fifth_conv_W, fifth_conv_b, fifth_conv_W_context
    ]
    '''
    first classifier
    '''

    def classifier(rng, common_input_l, common_input_r, sents_mask_l,
                   sents_mask_r, drop_conv_W_2_pre, conv_b_2_pre,
                   drop_conv_W_2_gate, conv_b_2_gate, drop_conv_W_2, conv_b_2,
                   drop_conv_W_2_context, conv_b_2_context, labels):
        conv_layer_2_gate_l = Conv_with_Mask_with_Gate(
            rng,
            input_tensor3=common_input_l,
            mask_matrix=sents_mask_l,
            image_shape=(batch_size, 1, emb_size, maxSentLen),
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre,
            b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate,
            b_gate=conv_b_2_gate)
        conv_layer_2_gate_r = Conv_with_Mask_with_Gate(
            rng,
            input_tensor3=common_input_r,
            mask_matrix=sents_mask_r,
            image_shape=(batch_size, 1, emb_size, maxSentLen),
            filter_shape=gate_filter_shape,
            W=drop_conv_W_2_pre,
            b=conv_b_2_pre,
            W_gate=drop_conv_W_2_gate,
            b_gate=conv_b_2_gate)

        l_input_4_att = conv_layer_2_gate_l.output_tensor3  #conv_layer_2_gate_l.masked_conv_out_sigmoid*conv_layer_2_pre_l.masked_conv_out+(1.0-conv_layer_2_gate_l.masked_conv_out_sigmoid)*common_input_l
        r_input_4_att = conv_layer_2_gate_r.output_tensor3  #conv_layer_2_gate_r.masked_conv_out_sigmoid*conv_layer_2_pre_r.masked_conv_out+(1.0-conv_layer_2_gate_r.masked_conv_out_sigmoid)*common_input_r

        conv_layer_2 = Conv_for_Pair(
            rng,
            origin_input_tensor3=common_input_l,
            origin_input_tensor3_r=common_input_r,
            input_tensor3=l_input_4_att,
            input_tensor3_r=r_input_4_att,
            mask_matrix=sents_mask_l,
            mask_matrix_r=sents_mask_r,
            image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
            image_shape_r=(batch_size, 1, hidden_size[0], maxSentLen),
            filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
            filter_shape_context=(hidden_size[1], 1, hidden_size[0], 1),
            W=drop_conv_W_2,
            b=conv_b_2,
            W_context=drop_conv_W_2_context,
            b_context=conv_b_2_context)
        attentive_sent_embeddings_l_2 = conv_layer_2.attentive_maxpool_vec_l
        attentive_sent_embeddings_r_2 = conv_layer_2.attentive_maxpool_vec_r
        # attentive_sent_sumpool_l_2 = conv_layer_2.attentive_sumpool_vec_l
        # attentive_sent_sumpool_r_2 = conv_layer_2.attentive_sumpool_vec_r

        HL_layer_1_input = T.concatenate([
            attentive_sent_embeddings_l_2, attentive_sent_embeddings_r_2,
            attentive_sent_embeddings_l_2 * attentive_sent_embeddings_r_2
        ],
                                         axis=1)

        HL_layer_1_input_size = hidden_size[
            1] * 3  #+extra_size#+(maxSentLen*2+10*2)#+hidden_size[1]*3+1

        HL_layer_1 = HiddenLayer(rng,
                                 input=HL_layer_1_input,
                                 n_in=HL_layer_1_input_size,
                                 n_out=hidden_size[0],
                                 activation=T.nnet.relu)
        HL_layer_2 = HiddenLayer(rng,
                                 input=HL_layer_1.output,
                                 n_in=hidden_size[0],
                                 n_out=hidden_size[0],
                                 activation=T.nnet.relu)

        LR_input_size = HL_layer_1_input_size + 2 * hidden_size[0]
        U_a = create_ensemble_para(
            rng, 3, LR_input_size)  # the weight matrix hidden_size*2
        LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                             name='LR_b',
                             borrow=True)  #bias for each target class
        LR_para = [U_a, LR_b]

        LR_input = T.tanh(
            T.concatenate(
                [HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],
                axis=1))
        layer_LR = LogisticRegression(
            rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b
        )  #basically it is a multiplication between weight matrix and input feature vector
        loss = layer_LR.negative_log_likelihood(
            labels
        )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

        return loss, LR_para + HL_layer_1.params + HL_layer_2.params, layer_LR.p_y_given_x, layer_LR.errors(
            labels)

    first_loss, first_classifier_params, first_test_distr, first_error = classifier(
        first_rng, first_common_input_l, first_common_input_r,
        first_sents_mask_l, first_sents_mask_r, drop_first_conv_W_pre,
        first_conv_b_pre, drop_first_conv_W_gate, first_conv_b_gate,
        drop_first_conv_W, first_conv_b, drop_first_conv_W_context,
        first_conv_b_context, first_labels)
    second_loss, second_classifier_params, second_test_distr, second_error = classifier(
        second_rng, second_common_input_l, second_common_input_r,
        second_sents_mask_l, second_sents_mask_r, drop_second_conv_W_pre,
        second_conv_b_pre, drop_second_conv_W_gate, second_conv_b_gate,
        drop_second_conv_W, second_conv_b, drop_second_conv_W_context,
        second_conv_b_context, second_labels)
    third_loss, third_classifier_params, third_test_distr, third_error = classifier(
        third_rng, third_common_input_l, third_common_input_r,
        third_sents_mask_l, third_sents_mask_r, drop_third_conv_W_pre,
        third_conv_b_pre, drop_third_conv_W_gate, third_conv_b_gate,
        drop_third_conv_W, third_conv_b, drop_third_conv_W_context,
        third_conv_b_context, third_labels)
    fourth_loss, fourth_classifier_params, fourth_test_distr, fourth_error = classifier(
        fourth_rng, fourth_common_input_l, fourth_common_input_r,
        fourth_sents_mask_l, fourth_sents_mask_r, drop_fourth_conv_W_pre,
        fourth_conv_b_pre, drop_fourth_conv_W_gate, fourth_conv_b_gate,
        drop_fourth_conv_W, fourth_conv_b, drop_fourth_conv_W_context,
        fourth_conv_b_context, fourth_labels)
    fifth_loss, fifth_classifier_params, fifth_test_distr, fifth_error = classifier(
        fifth_rng, fifth_common_input_l, fifth_common_input_r,
        fifth_sents_mask_l, fifth_sents_mask_r, drop_fifth_conv_W_pre,
        fifth_conv_b_pre, drop_fifth_conv_W_gate, fifth_conv_b_gate,
        drop_fifth_conv_W, fifth_conv_b, drop_fifth_conv_W_context,
        fifth_conv_b_context, fifth_labels)
    '''
    testing, labels == second_labels
    '''
    all_prop_distr = first_test_distr + second_test_distr + third_test_distr + fourth_test_distr + fifth_test_distr
    first_preds = T.argmax(first_test_distr, axis=1).dimshuffle('x',
                                                                0)  #(1, batch)
    second_preds = T.argmax(second_test_distr,
                            axis=1).dimshuffle('x', 0)  #(1, batch)
    third_preds = T.argmax(third_test_distr, axis=1).dimshuffle('x',
                                                                0)  #(1, batch)
    fourth_preds = T.argmax(fourth_test_distr,
                            axis=1).dimshuffle('x', 0)  #(1, batch)
    fifth_preds = T.argmax(fifth_test_distr, axis=1).dimshuffle('x',
                                                                0)  #(1, batch)
    overall_preds = T.concatenate(
        [first_preds, second_preds, third_preds, fourth_preds, fifth_preds],
        axis=0)  #(5, batch)
    all_error = T.mean(T.neq(T.argmax(all_prop_distr, axis=1), first_labels))

    #     neg_labels = T.where( labels < 2, 2, labels-1)
    #     loss2=-T.mean(T.log(1.0/(1.0+layer_LR.p_y_given_x))[T.arange(neg_labels.shape[0]), neg_labels])

    # rank loss
    # entail_prob_batch = T.nnet.softmax(layer_LR.before_softmax.T)[2] #batch
    # entail_ids = elementwise_is_two(labels)
    # entail_probs = entail_prob_batch[entail_ids.nonzero()]
    # non_entail_probs = entail_prob_batch[(1-entail_ids).nonzero()]
    #
    # repeat_entail = T.extra_ops.repeat(entail_probs, non_entail_probs.shape[0], axis=0)
    # repeat_non_entail = T.extra_ops.repeat(non_entail_probs.dimshuffle('x',0), entail_probs.shape[0], axis=0).flatten()
    # loss2 = -T.mean(T.log(entail_probs))#T.mean(T.maximum(0.0, margin-repeat_entail+repeat_non_entail))

    # zero_matrix = T.zeros((batch_size, 3))
    # filled_zero_matrix = T.set_subtensor(zero_matrix[T.arange(batch_size), labels], 1.0)
    # prob_batch_posi = layer_LR.p_y_given_x[filled_zero_matrix.nonzero()]
    # prob_batch_nega = layer_LR.p_y_given_x[(1-filled_zero_matrix).nonzero()]
    #
    # repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    # repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    # loss2 = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    first_params = [first_embeddings] + first_NN_para + first_classifier_params
    second_params = [second_embeddings
                     ] + second_NN_para + second_classifier_params
    third_params = [third_embeddings] + third_NN_para + third_classifier_params
    fourth_params = [fourth_embeddings
                     ] + fourth_NN_para + fourth_classifier_params
    fifth_params = [fifth_embeddings] + fifth_NN_para + fifth_classifier_params

    params = first_params + second_params + third_params + fourth_params + fifth_params

    #     L2_reg =L2norm_paraList([embeddings,HL_layer_1.W, HL_layer_2.W])

    #     diversify_reg= (Diversify_Reg(conv_W_2_pre_to_matrix)+Diversify_Reg(conv_W_2_gate_to_matrix)+
    #                     Diversify_Reg(conv_W_2_to_matrix)+Diversify_Reg(conv_W_2_context_to_matrix))

    cost = first_loss + second_loss + third_loss + fourth_loss + fifth_loss  #+0.1*loss2#+loss2#+L2_weight*L2_reg

    first_updates = Gradient_Cost_Para(first_loss, first_params, learning_rate)
    second_updates = Gradient_Cost_Para(second_loss, second_params,
                                        learning_rate)
    third_updates = Gradient_Cost_Para(third_loss, third_params, learning_rate)
    fourth_updates = Gradient_Cost_Para(fourth_loss, fourth_params,
                                        learning_rate)
    fifth_updates = Gradient_Cost_Para(fifth_loss, fifth_params, learning_rate)
    updates = first_updates + second_updates + third_updates + fourth_updates + fifth_updates

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([
        train_flag, first_sents_ids_l, first_sents_mask_l, first_sents_ids_r,
        first_sents_mask_r, first_labels, second_sents_ids_l,
        second_sents_mask_l, second_sents_ids_r, second_sents_mask_r,
        second_labels, third_sents_ids_l, third_sents_mask_l,
        third_sents_ids_r, third_sents_mask_r, third_labels,
        fourth_sents_ids_l, fourth_sents_mask_l, fourth_sents_ids_r,
        fourth_sents_mask_r, fourth_labels, fifth_sents_ids_l,
        fifth_sents_mask_l, fifth_sents_ids_r, fifth_sents_mask_r, fifth_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    #     train_model_pred = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra,labels,
    #                                         second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], [LR_input, labels], allow_input_downcast=True, on_unused_input='ignore')
    #
    #     dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, train_flag,extra, labels,
    #                                  second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        train_flag, first_sents_ids_l, first_sents_mask_l, first_sents_ids_r,
        first_sents_mask_r, first_labels, second_sents_ids_l,
        second_sents_mask_l, second_sents_ids_r, second_sents_mask_r,
        second_labels, third_sents_ids_l, third_sents_mask_l,
        third_sents_ids_r, third_sents_mask_r, third_labels,
        fourth_sents_ids_l, fourth_sents_mask_l, fourth_sents_ids_r,
        fourth_sents_mask_r, fourth_labels, fifth_sents_ids_l,
        fifth_sents_mask_l, fifth_sents_ids_r, fifth_sents_mask_r, fifth_labels
    ], [
        first_error, second_error, third_error, fourth_error, fifth_error,
        all_error, overall_preds
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0
    max_acc_svm = 0.0
    max_acc_lr = 0.0
    max_majority_acc = 0.0

    cost_i = 0.0
    first_train_indices = range(train_size)
    second_train_indices = range(train_size)
    third_train_indices = range(train_size)
    fourth_train_indices = range(train_size)
    fifth_train_indices = range(train_size)
    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            first_train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed
        random.Random(200).shuffle(second_train_indices)
        random.Random(300).shuffle(third_train_indices)
        random.Random(400).shuffle(fourth_train_indices)
        random.Random(500).shuffle(fifth_train_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            first_train_id_batch = first_train_indices[batch_id:batch_id +
                                                       batch_size]
            second_train_id_batch = second_train_indices[batch_id:batch_id +
                                                         batch_size]
            third_train_id_batch = third_train_indices[batch_id:batch_id +
                                                       batch_size]
            fourth_train_id_batch = fourth_train_indices[batch_id:batch_id +
                                                         batch_size]
            fifth_train_id_batch = fifth_train_indices[batch_id:batch_id +
                                                       batch_size]
            cost_i += train_model(1, train_sents_l[first_train_id_batch],
                                  train_masks_l[first_train_id_batch],
                                  train_sents_r[first_train_id_batch],
                                  train_masks_r[first_train_id_batch],
                                  train_labels_store[first_train_id_batch],
                                  train_sents_l[second_train_id_batch],
                                  train_masks_l[second_train_id_batch],
                                  train_sents_r[second_train_id_batch],
                                  train_masks_r[second_train_id_batch],
                                  train_labels_store[second_train_id_batch],
                                  train_sents_l[third_train_id_batch],
                                  train_masks_l[third_train_id_batch],
                                  train_sents_r[third_train_id_batch],
                                  train_masks_r[third_train_id_batch],
                                  train_labels_store[third_train_id_batch],
                                  train_sents_l[fourth_train_id_batch],
                                  train_masks_l[fourth_train_id_batch],
                                  train_sents_r[fourth_train_id_batch],
                                  train_masks_r[fourth_train_id_batch],
                                  train_labels_store[fourth_train_id_batch],
                                  train_sents_l[fifth_train_id_batch],
                                  train_masks_l[fifth_train_id_batch],
                                  train_sents_r[fifth_train_id_batch],
                                  train_masks_r[fifth_train_id_batch],
                                  train_labels_store[fifth_train_id_batch])

            #after each 1000 batches, we test the performance of the model on all test data
            if iter % int(2000 * (50.0 / batch_size)) == 0:
                #             if iter%int(200*(50.0 / batch_size))==0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()
                # if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
                #     print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                #     past_time = time.time()

                #                 error_sum=0.0
                #                 for dev_batch_id in dev_batch_start: # for each test batch
                #                     error_i=dev_model(
                #                                 dev_sents_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_sents_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_masks_r[dev_batch_id:dev_batch_id+batch_size],
                #                                 dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                #                                 )
                #
                #                     error_sum+=error_i
                #                 dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                #                 if dev_accuracy > max_acc_dev:
                #                     max_acc_dev=dev_accuracy
                #                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                #best dev model, do test
                error_sum_1 = 0.0
                error_sum_2 = 0.0
                error_sum_3 = 0.0
                error_sum_4 = 0.0
                error_sum_5 = 0.0
                error_both = 0.0
                ys = []
                gold_ys = []
                for test_batch_id in test_batch_start:  # for each test batch
                    error_1, error_2, error_3, error_4, error_5, both_error_batch, batch_ys = test_model(
                        0, test_sents_l[test_batch_id:test_batch_id +
                                        batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size],
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum_1 += error_1
                    error_sum_2 += error_2
                    error_sum_3 += error_3
                    error_sum_4 += error_4
                    error_sum_5 += error_5
                    error_both += both_error_batch
                    ys.append(batch_ys)
                    gold_ys.append(
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])
                test_acc_1 = 1.0 - error_sum_1 / (len(test_batch_start))
                test_acc_2 = 1.0 - error_sum_2 / (len(test_batch_start))
                test_acc_3 = 1.0 - error_sum_3 / (len(test_batch_start))
                test_acc_4 = 1.0 - error_sum_4 / (len(test_batch_start))
                test_acc_5 = 1.0 - error_sum_5 / (len(test_batch_start))

                all_ys = np.concatenate(ys, axis=1)  #(5, test_size)
                gold_ys = np.concatenate(gold_ys)
                majority_ys = mode(np.transpose(all_ys), axis=-1)[0][:, 0]
                majority_acc = 1.0 - np.not_equal(
                    gold_ys, majority_ys).sum() * 1.0 / len(gold_ys)
                test_acc_both = 1.0 - error_both / (len(test_batch_start))
                if test_acc_1 > max_acc_test:
                    max_acc_test = test_acc_1
                if test_acc_2 > max_acc_test:
                    max_acc_test = test_acc_2
                if test_acc_3 > max_acc_test:
                    max_acc_test = test_acc_3
                if test_acc_4 > max_acc_test:
                    max_acc_test = test_acc_4
                if test_acc_5 > max_acc_test:
                    max_acc_test = test_acc_5

                if test_acc_both > max_acc_test:
                    max_acc_test = test_acc_both
                    store_model_to_file(
                        '/mounts/data/proj/wenpeng/Dataset/StanfordEntailment/model_para_five_copies_'
                        + str(max_acc_test), params)
                if majority_acc > max_majority_acc:
                    max_majority_acc = majority_acc

                print '\t\tcurrent acc:', test_acc_1, test_acc_2, test_acc_3, test_acc_4, test_acc_5, ' ; ', test_acc_both, majority_acc, '\t\t\t\t\tmax_acc:', max_acc_test, max_majority_acc


#                 else:
#                     print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test
コード例 #11
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=70,
                    filter_size=[3, 1],
                    maxSentLen=70,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, word2id = load_SNLI_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    train_sents_l = np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l = np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    train_masks_l = np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l = np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r = np.asarray(all_sentences_r[1], dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    train_masks_r = np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r = np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[0], dtype='int32')
    dev_labels_store = np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' dev size: ', dev_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    "first randomly initialize each word in the matrix 'rand_values', then load pre-trained word2vec embeddinds to initialize words, uncovered"
    "words keep random initialization"
    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    'Use word ids in sentences to retrieve word embeddings from matrix "init_embeddings", each sentence will be in'
    'tensor2 (emb_size, sen_length), then the minibatch will be in tensor3 (batch_size, emb_size, sen_length) '
    embed_input_l = init_embeddings[sents_ids_l.flatten(
    )].reshape((batch_size, maxSentLen, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = init_embeddings[sents_ids_r.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    '''create parameters for attentive convolution function '''
    gate_filter_shape = (emb_size, 1, emb_size, 1)
    conv_W_pre, conv_b_pre = create_conv_para(rng,
                                              filter_shape=gate_filter_shape)
    conv_W_gate, conv_b_gate = create_conv_para(rng,
                                                filter_shape=gate_filter_shape)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    conv_W2_context, conv_b2_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, conv_W_context, conv_W_pre, conv_b_pre, conv_W_gate,
        conv_b_gate, conv_W2, conv_b2, conv_W2_context
    ]

    "A gated convolution layer to form more expressive word representations in each sentence"
    "input tensor3 (batch_size, emb_size, sen_length), output tensor3 (batch_size, emb_size, sen_length)"
    conv_layer_gate_l = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_l,
        mask_matrix=sents_mask_l,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    conv_layer_gate_r = Conv_with_Mask_with_Gate(
        rng,
        input_tensor3=embed_input_r,
        mask_matrix=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=gate_filter_shape,
        W=conv_W_pre,
        b=conv_b_pre,
        W_gate=conv_W_gate,
        b_gate=conv_b_gate)
    '''
    attentive convolution function, two sizes of filter_width 3&1 are used. Multi-channel
    '''

    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=conv_layer_gate_l.output_tensor3,
        input_tensor3_r=conv_layer_gate_r.output_tensor3,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_W2,
        b=conv_b2,
        W_context=conv_W2_context,
        b_context=conv_b2_context)
    attentive_sent_embeddings_l2 = attentive_conv_layer2.attentive_maxpool_vec_l
    attentive_sent_embeddings_r2 = attentive_conv_layer2.attentive_maxpool_vec_r

    "Batch normalization for the four output sentence representation vectors"
    gamma = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[0]),
        high=1.0 / math.sqrt(hidden_size[0]),
        size=(hidden_size[0])),
                                     dtype=theano.config.floatX),
                          borrow=True)
    beta = theano.shared(np.zeros((hidden_size[0]),
                                  dtype=theano.config.floatX),
                         borrow=True)
    bn_params = [gamma, beta]
    bn_attentive_sent_embeddings_l = batch_normalization(
        inputs=attentive_sent_embeddings_l,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r = batch_normalization(
        inputs=attentive_sent_embeddings_r,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r.std((0, ), keepdims=True),
        mode='low_mem')

    bn_attentive_sent_embeddings_l2 = batch_normalization(
        inputs=attentive_sent_embeddings_l2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_l2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_l2.std((0, ), keepdims=True),
        mode='low_mem')
    bn_attentive_sent_embeddings_r2 = batch_normalization(
        inputs=attentive_sent_embeddings_r2,
        gamma=gamma,
        beta=beta,
        mean=attentive_sent_embeddings_r2.mean((0, ), keepdims=True),
        std=attentive_sent_embeddings_r2.std((0, ), keepdims=True),
        mode='low_mem')

    "Before logistic regression layer, we insert a hidden layer. Now form input to HL classifier"
    HL_layer_1_input = T.concatenate([
        bn_attentive_sent_embeddings_l, bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l + bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l * bn_attentive_sent_embeddings_r,
        bn_attentive_sent_embeddings_l2, bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 + bn_attentive_sent_embeddings_r2,
        bn_attentive_sent_embeddings_l2 * bn_attentive_sent_embeddings_r2
    ],
                                     axis=1)
    HL_layer_1_input_size = 8 * hidden_size[0]
    "Create hidden layer parameters"
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, HL_layer_1_input_size, hidden_size[1])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    "Hidden Layer and batch norm to its output again"
    HL_layer_1 = HiddenLayer(rng,
                             input=HL_layer_1_input,
                             n_in=HL_layer_1_input_size,
                             n_out=hidden_size[1],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    gamma_HL = theano.shared(np.asarray(rng.uniform(
        low=-1.0 / math.sqrt(hidden_size[1]),
        high=1.0 / math.sqrt(hidden_size[1]),
        size=(hidden_size[1])),
                                        dtype=theano.config.floatX),
                             borrow=True)
    beta_HL = theano.shared(np.zeros((hidden_size[1]),
                                     dtype=theano.config.floatX),
                            borrow=True)
    bn_params_HL = [gamma_HL, beta_HL]
    bn_HL_output = batch_normalization(inputs=HL_layer_1.output,
                                       gamma=gamma_HL,
                                       beta=beta_HL,
                                       mean=HL_layer_1.output.mean(
                                           (0, ), keepdims=True),
                                       std=HL_layer_1.output.std(
                                           (0, ), keepdims=True),
                                       mode='low_mem')
    "Form input to LR classifier"
    LR_input = T.concatenate([HL_layer_1_input, bn_HL_output], axis=1)
    LR_input_size = HL_layer_1_input_size + hidden_size[1]
    U_a = create_ensemble_para(rng, 3, LR_input_size)  # (input_size, 3)
    LR_b = theano.shared(value=np.zeros((3, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    "Logistic Regression layer"
    layer_LR = LogisticRegression(
        rng,
        input=normalize_matrix_col_wise(LR_input),
        n_in=LR_input_size,
        n_out=3,
        W=U_a,
        b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    loss = layer_LR.negative_log_likelihood(
        labels
    )  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [
        init_embeddings
    ] + NN_para + LR_para + bn_params + HL_layer_1_params + bn_params_HL
    cost = loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    dev_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels],
        layer_LR.errors(labels),
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  #shuffle training set for each new epoch, is supposed to promote performance, but not garrenteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter means how many batches have been run, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(train_sents_l[train_id_batch],
                                  train_masks_l[train_id_batch],
                                  train_sents_r[train_id_batch],
                                  train_masks_r[train_id_batch],
                                  train_labels_store[train_id_batch])

            if (epoch == 1 and iter % 1000 == 0) or (epoch >= 2
                                                     and iter % 5 == 0):
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                dev_error_sum = 0.0
                for dev_batch_id in dev_batch_start:  # for each test batch
                    dev_error_i = dev_model(
                        dev_sents_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_l[dev_batch_id:dev_batch_id + batch_size],
                        dev_sents_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_masks_r[dev_batch_id:dev_batch_id + batch_size],
                        dev_labels_store[dev_batch_id:dev_batch_id +
                                         batch_size])

                    dev_error_sum += dev_error_i
                dev_acc = 1.0 - dev_error_sum / (len(dev_batch_start))

                if dev_acc > max_acc_dev:
                    max_acc_dev = dev_acc
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev
                    '''
                    best dev model, test
                    '''
                    error_sum = 0.0
                    for test_batch_id in test_batch_start:  # for each test batch
                        error_i = test_model(
                            test_sents_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_l[test_batch_id:test_batch_id +
                                         batch_size],
                            test_sents_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_masks_r[test_batch_id:test_batch_id +
                                         batch_size],
                            test_labels_store[test_batch_id:test_batch_id +
                                              batch_size])

                        error_sum += error_i
                    test_acc = 1.0 - error_sum / (len(test_batch_start))

                    if test_acc > max_acc_test:
                        max_acc_test = test_acc
                    print '\t\tcurrent test_acc:', test_acc, ' ; ', '\t\t\t\t\tmax_test_acc:', max_acc_test
                else:
                    print '\tcurrent dev_acc:', dev_acc, ' ; ', '\tmax_dev_acc:', max_acc_dev

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc_test