Code example #1
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fixed random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, word2id = load_BBN_multi_labels_dataset(
        maxlen=maxSentLen
    )  # minlen: each example has at least one word in the sentence and at least one label
    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dev_size = len(dev_labels)

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index
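    # embedding initialization: Gaussian noise for every word, the row at index 0 kept
    # as an all-zero padding vector, then rows present in the pretrained fastText files
    # are overwritten by load_word2vec_to_init below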

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the numpy array "rand_values" in a Theano shared variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  # this input format can be fed to a CNN, GRU, or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    # bow_mean_emb = bow_emb/T.sum(sents_mask,axis=1).dimshuffle(0,'x')

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    NN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so features at padded (UNK) positions are set to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  # multiply the mask with conv_out so features at padded (UNK) positions are set to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer: maps the concatenated feature vector to scores for the 12 target classes
    U_a = create_ensemble_para(
        rng, 12, LR_input_size)  # weight matrix mapping LR_input_size features to 12 classes
    LR_b = theano.shared(value=np.zeros((12, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)

    loss = -T.mean(T.log(prob_pos))
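    # multi-label loss: each of the 12 classes is scored with an independent sigmoid;
    # prob_pos is the probability assigned to each gold 0/1 label, and the loss is the
    # mean negative log-likelihood over the batch and the 12 classes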
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  # U1 and W1 each pack 3 matrices (one per GRU gate); b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = gru_sent_embeddings  #T.concatenate([sent_att_embeddings,sent_att_embeddings2, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0]
    #classification layer: maps the GRU sentence representation to scores for the 12 target classes
    U_att_a = create_ensemble_para(
        rng, 12, LR_att_input_size)  # weight matrix mapping LR_att_input_size features to 12 classes
    LR_att_b = theano.shared(value=np.zeros((12, ),
                                            dtype=theano.config.floatX),
                             name='LR_att_b',
                             borrow=True)  #bias for each target class
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)

    att_loss = -T.mean(T.log(att_prob_pos))

    params = NN_para + LR_para + GRU_NN_para + LR_att_para  # put all model parameters together
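    # total cost: CNN-branch loss + GRU-branch loss + L2 penalty (1e-4) on the two convolution filter banks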
    cost = loss + att_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)
    '''
    testing
    '''

    ensemble_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1)
    ],
                                          axis=0),
                            axis=0)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
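    # test-time ensemble: take the element-wise max of the GRU-branch and CNN-branch
    # sigmoid scores and predict a label whenever the ensembled score exceeds 0.3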

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_id_matrix, sents_mask, labels],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_id_matrix, sents_mask, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_id_matrix, sents_mask],
                                 binarize_prob,
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
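    # batches start at multiples of batch_size; one extra batch anchored at the end
    # covers the remainder (its examples overlap with the previous batch)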

    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
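        # note: re-seeding with 100 here yields the same shuffle order in every epoch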
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed across all epochs so far
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size])
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #2
File: train_SQUAD.py Project: yinwenpeng/SQUAD
def evaluate_lenet5(learning_rate=0.5, n_epochs=2000, batch_size=500, emb_size=300, hidden_size=300,
                    L2_weight=0.0001, para_len_limit=700, q_len_limit=40):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    id2word = {y:x for x,y in overall_word2id.iteritems()}
    word2vec=load_word2vec()
    


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')   
    questions = T.imatrix('questions')  
    labels = T.imatrix('labels')
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)


    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # look up the paragraph word embeddings and arrange them as (batch_size, emb_size, max_para_len)
    paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
#     
# #     BdGRU(rng, str(0), shape, X, mask, is_train = 1, batch_size = 1, p = 0.5)
#     
    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] 
    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)




 
    Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)
    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] 
    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 1, hidden_size)
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)
    
    
    #attention distributions
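    # attention block: project the paragraph token states with W_a2 and the pooled
    # question vector with W_a1 (both through tanh), average the two, append the 3
    # normalized hand-crafted token features, and feed every paragraph position to
    # the logistic-regression scorer below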
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features
    
    norm_W_a1=normalize_matrix(W_a1)
    norm_W_a2=normalize_matrix(W_a2)
    norm_U_a=normalize_matrix(U_a)

    LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX),
                         name='LR_b', borrow=True)
     
    attention_paras=[W_a1, W_a2, U_a, LR_b]
    
    transformed_para_reps=T.tanh(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2))
    transformed_q_reps=T.tanh(T.dot(questions_reps, norm_W_a1))
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)    
    
    add_both=0.5*(transformed_para_reps+transformed_q_reps)
    prior_att=T.concatenate([add_both, normalize_matrix(extraF)], axis=2)
    
    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices=para_mask.flatten().nonzero()[0]
    
    layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.mean(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y])

    distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    masked_dis=distributions*para_mask
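    # each paragraph token is classified as answer/non-answer; the loss averages the
    # negative log-likelihood over non-padding positions (valid_indices), and masked_dis
    # keeps the per-token answer probability only where para_mask is 1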
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()


    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''
   
#     def AttentionLayer(q_rep, ext_M):
#         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
#         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
#        f __name__ == '__main__': 
#         prior_att=T.concatenate([prior_att, ext_M], axis=1)
#                               
#         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
#         return strength.transpose() #(1, #words)
 
#     distributions, updates = theano.scan(
#     AttentionLayer,
#     sequences=[questions_reps,extraF] )
    
#     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
#     labels=debug_print(labels, 'labels')
#     label_mask=T.gt(labels,0.0)
#     neg_label_mask=T.lt(labels,0.0)
#     dis_masked=distributions*label_mask
#     remain_dis_masked=distributions*neg_label_mask
#     pos_error=((dis_masked-1)**2).mean()
#     neg_error=((remain_dis_masked-(-1))**2).mean()
#     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error
    


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [embeddings]+paragraph_para+Q_para+attention_paras
    L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=error#+L2_weight*L2_reg
    
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
        
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
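    # AdaGrad: accumulate squared gradients per parameter and divide the learning rate
    # by the square root of the accumulator (plus a small epsilon for stability)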
  
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))    



    train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], error, updates=updates,on_unused_input='ignore')
    
    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False
    

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_exact_acc=0.0
    cost_i=0.0
    
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #shuffle(train_batch_start)
        iter_accu=0
        for para_id in train_batch_start: 
            # iter counts how many batches have been processed across all epochs so far
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                np.asarray(train_para_list[para_id:para_id+batch_size], dtype='int32'),
                np.asarray(train_Q_list[para_id:para_id+batch_size], dtype='int32'),
                np.asarray(train_label_list[para_id:para_id+batch_size], dtype='int32'),
                np.asarray(train_para_mask[para_id:para_id+batch_size], dtype=theano.config.floatX),
                np.asarray(train_mask[para_id:para_id+batch_size], dtype=theano.config.floatX),
                np.asarray(train_feature_matrixlist[para_id:para_id+batch_size], dtype=theano.config.floatX))

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                  
                exact_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    distribution_matrix=test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                        np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX))
                    
#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size]
                    para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size]
                    paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    para_len=len(test_para_wordlist_list[0])
                    if para_len!=len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=batch_size
#                     print q_size
#                     print test_para_word_list
                    for q in range(batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q])
                        q_gold_ans_set=para_gold_ansset_list[q]
                         
                        F1=MacroF1(pred_ans, q_gold_ans_set)
                        exact_match+=F1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                exact_acc=exact_match/q_amount
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                print 'current average F1:', exact_acc, '\t\tmax F1:', max_exact_acc
                        



            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #3
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, batch_size=20, test_batch_size=200, emb_size=300, hidden_size=300,
                    L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=50.302743615):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    

#     glove_vocab=set(word2vec.keys())
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)
    
    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)





    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in overall_word2id.iteritems()}
#     word2vec=load_glove()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
#     labels = T.imatrix('labels')  #(batch, para_len)
    gold_indices= T.ivector() #batch
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)
    is_train = T.iscalar()


    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size=paragraph.shape[0]

    norm_extraF=normalize_matrix(extraF)

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b]

    U_e1, W_e1, b_e1=create_GRU_para(rng, 3*hidden_size+3, hidden_size)
    U_e1_b, W_e1_b, b_e1_b=create_GRU_para(rng, 3*hidden_size+3, hidden_size)
    paragraph_para_e1=[U_e1, W_e1, b_e1, U_e1_b, W_e1_b, b_e1_b]


    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

#     W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
#     W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, 2*hidden_size) # maps a 2*hidden_size span representation to a single score
#     LR_b = theano.shared(value=numpy.zeros((2,),
#                                                  dtype=theano.config.floatX),  # @UndefinedVariable
#                                name='LR_b', borrow=True)

    HL_paras=[U_a]
    params = [embeddings]+paragraph_para+Q_para+paragraph_para_e1+HL_paras

    load_model_from_file(rootPath+'Best_Paras_conv_50.302743614', params)
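    # warm-start: load all parameters from the stored checkpoint before training continues;
    # an improved checkpoint is only saved later when the exact-match score exceeds max_EM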

    paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
    concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1)

    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)

#     #LSTM
#     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
#     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
#     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)
    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    questions_reps_tensor=questions_model.output_tensor
    questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden)
    questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)  #(batch, para_len, hidden)

#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor






    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix=para_matrix.T
        interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
#         norm_interaction_matrix=T.maximum(0.0, interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(hidden, para_len)
    batch_q_reps, updates = theano.scan(fn=example_in_batch,
                                   outputs_info=None,
                                   sequences=[para_reps, questions_reps_tensor])    #batch_q_reps (batch, hidden, para_len)
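    # word-by-word attention: for each example, dot paragraph states against question
    # states, softmax over the question positions for every paragraph token, and take the
    # attention-weighted average of the question hidden states, giving one question
    # summary vector per paragraph position; batch_q_reps is (batch, hidden, para_len)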



    #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len)
    ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), norm_extraF.dimshuffle(0,2,1)], axis=1) #(batch, 3*hidden+3, para_len)

    
    para_ensemble_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,U=U_e1,W=W_e1,b=b_e1,Ub=U_e1_b,Wb=W_e1_b,bb=b_e1_b)
    para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, hidden ,para_len)
    para_reps_tensor4score = dropout_standard(is_train, para_reps_tensor4score, 0.2, rng)
    #for span reps
    span_1=T.concatenate([para_reps_tensor4score, para_reps_tensor4score], axis=1) #(batch, 2*hidden ,para_len)
    span_2=T.concatenate([para_reps_tensor4score[:,:,:-1], para_reps_tensor4score[:,:,1:]], axis=1) #(batch, 2*hidden ,para_len-1)
    span_3=T.concatenate([para_reps_tensor4score[:,:,:-2], para_reps_tensor4score[:,:,2:]], axis=1) #(batch, 2*hidden ,para_len-2)
    span_4=T.concatenate([para_reps_tensor4score[:,:,:-3], para_reps_tensor4score[:,:,3:]], axis=1) #(batch, 2*hidden ,para_len-3)
    span_5=T.concatenate([para_reps_tensor4score[:,:,:-4], para_reps_tensor4score[:,:,4:]], axis=1) #(batch, 2*hidden ,para_len-4)
    span_6=T.concatenate([para_reps_tensor4score[:,:,:-5], para_reps_tensor4score[:,:,5:]], axis=1) #(batch, 2*hidden ,para_len-5)
    span_7=T.concatenate([para_reps_tensor4score[:,:,:-6], para_reps_tensor4score[:,:,6:]], axis=1) #(batch, 2*hidden ,para_len-6)

    span_8=T.concatenate([para_reps_tensor4score[:,:,:-7], para_reps_tensor4score[:,:,7:]], axis=1) #(batch, 2*hidden ,para_len-7)
    span_9=T.concatenate([para_reps_tensor4score[:,:,:-8], para_reps_tensor4score[:,:,8:]], axis=1) #(batch, 2*hidden ,para_len-8)
    span_10=T.concatenate([para_reps_tensor4score[:,:,:-9], para_reps_tensor4score[:,:,9:]], axis=1) #(batch, 2*hidden ,para_len-9)
    span_11=T.concatenate([para_reps_tensor4score[:,:,:-10], para_reps_tensor4score[:,:,10:]], axis=1) #(batch, 2*hidden ,para_len-10)
    span_12=T.concatenate([para_reps_tensor4score[:,:,:-11], para_reps_tensor4score[:,:,11:]], axis=1) #(batch, 2*hidden ,para_len-11)
    span_13=T.concatenate([para_reps_tensor4score[:,:,:-12], para_reps_tensor4score[:,:,12:]], axis=1) #(batch, 2*hidden ,para_len-12)

    span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7,
                             span_8, span_9, span_10, span_11, span_12, span_13], axis=2) #(batch, 2*hidden, 13*para_len-78)
    test_span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7], axis=2) #(batch, 2*hidden, 7*para_len-21)
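    # candidate answer spans of length 1..13 are represented by concatenating the hidden
    # states of their first and last tokens; stacking them along the length axis gives
    # 13*para_len-78 training candidates, while only lengths 1..7 are scored at test time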
    #score each span reps
    norm_U_a=normalize_matrix(U_a)
    span_scores_tensor=T.dot(span_reps.dimshuffle(0,2,1), norm_U_a)  #(batch, 13*para_len-78, 1)
    span_scores=T.nnet.softmax(span_scores_tensor.reshape((true_batch_size, 13*paragraph.shape[1]-78))) #(batch, 13*para_len-78)
    loss=-T.sum(T.log(span_scores[T.arange(true_batch_size), gold_indices]))
    
    test_span_scores_tensor=T.dot(test_span_reps.dimshuffle(0,2,1), norm_U_a)  #(batch, 7*para_len-21, 1)
    test_span_scores=T.nnet.softmax(test_span_scores_tensor.reshape((true_batch_size, 7*paragraph.shape[1]-21))) #(batch, 7*para_len-21)    
    test_return=T.argmax(test_span_scores, axis=1) #batch
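    # test-time prediction: softmax over the 7*para_len-21 candidate spans and return the
    # argmax index, which is later decoded back to a paragraph token span by decode_predict_id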


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
#     L2_reg = L2norm_paraList([embeddings])
    cost=loss#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
   
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
   
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, extraF, is_train], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF, is_train], test_return, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter counts how many batches have been processed across all epochs so far
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                1)

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
#                 writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8')
#                 writefile.write('{')
                pred_dict={}
#                 exact_match=0.0
#                 F1_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    batch_predict_ids=test_model(
                        numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                        numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                        numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                        numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                        numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                        0)

#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size]
#                     para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size]
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
#                     print 'q_ids_batch:', q_ids_batch
                    # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    # para_len=len(test_para_wordlist_list[0])
                    # if para_len!=len(distribution_matrix[0]):
                    #     print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                    #     exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=test_batch_size
#                     print q_size
#                     print test_para_word_list

#                     Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size]
                    for q in range(test_batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_list[q])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
#                         writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ')
                        # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
#                         q_gold_ans_set=para_gold_ansset_list[q]
# #                         print test_para_wordlist_list[q]
# #                         print Q_list_inword[q]
# #                         print pred_ans.encode('utf8'), q_gold_ans_set
#                         if pred_ans in q_gold_ans_set:
#                             exact_match+=1
#                         F1=MacroF1(pred_ans, q_gold_ans_set)
#                         F1_match+=F1
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
#                 F1_acc=F1_match/q_amount
#                 exact_acc=exact_match/q_amount
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
                
                
#                 os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt')




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #4
File: train_SQUAD_conv.py Project: yinwenpeng/SQUAD
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10,
                    L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)


    


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in overall_word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')   
    questions = T.imatrix('questions')  
    labels = T.imatrix('labels')
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)


    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    
    norm_extraF=normalize_matrix(extraF)

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] 

    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] 

    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features
    LR_b = theano.shared(value=numpy.zeros((2,), dtype=theano.config.floatX),
                         name='LR_b', borrow=True)
     
    attention_paras=[W_a1, W_a2, U_a, LR_b]  
    params = [embeddings]+paragraph_para+Q_para+attention_paras
    
    load_model_from_file(rootPath+'Best_Paras_conv_0.217545454545', params)
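    # resume training from the previously stored best parameter set before fine-tuning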
    
    paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
    concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1)


    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)

#     #LSTM
#     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
#     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
#     para_reps=paragraph_model.output_tensor
 
    Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)

    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
#     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor=questions_model.output_tensor
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)
    
#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor
        
#use CNN for question modeling
#     Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
#     conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
#     Q_conv_para=[conv_W, conv_b]
#     conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
#             image_shape=(batch_size, 1, emb_size, q_len_limit),
#             filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
#     conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
#     gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
#     masked_conv_output=conv_output*gru_mask
#     questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))





    
#     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
#     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
#     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
#     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
#     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) 
#     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) 
#     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)
    
    
    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix=para_matrix.T
        interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T) #(hidden, para_len)
    batch_q_reps, updates = theano.scan(fn=example_in_batch,
                                   outputs_info=None,
                                   sequences=[para_reps, questions_reps_tensor])    #batch_q_reps (batch, hidden, para_len)
    
       
    #attention distributions
  
    norm_W_a1=normalize_matrix(W_a1)
    norm_W_a2=normalize_matrix(W_a2)
    norm_U_a=normalize_matrix(U_a)


    
    transformed_para_reps=T.maximum(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2),0.0)   #relu
    transformed_q_reps=T.maximum(T.dot(batch_q_reps.transpose((0, 2,1)), norm_W_a1),0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)    
    
    add_both=transformed_para_reps+transformed_q_reps
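    # combine the ReLU-projected paragraph states with the ReLU-projected attention
    # summaries by element-wise addition; the normalized extra token features are
    # appended along the feature axis below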

#     U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
#     U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
#     accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] 
#     accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)    
#     accu_both=accumu_model.output_tensor.transpose((0,2,1))
    
    prior_att=T.concatenate([add_both, norm_extraF], axis=2)
    
    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices=para_mask.flatten().nonzero()[0]
    
    layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.sum(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y])

    distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
#     masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis=distributions*para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''
   
#     def AttentionLayer(q_rep, ext_M):
#         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
#         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
#        f __name__ == '__main__': 
#         prior_att=T.concatenate([prior_att, ext_M], axis=1)
#                               
#         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
#         return strength.transpose() #(1, #words)
 
#     distributions, updates = theano.scan(
#     AttentionLayer,
#     sequences=[questions_reps,extraF] )
    
#     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
#     labels=debug_print(labels, 'labels')
#     label_mask=T.gt(labels,0.0)
#     neg_label_mask=T.lt(labels,0.0)
#     dis_masked=distributions*label_mask
#     remain_dis_masked=distributions*neg_label_mask
#     pos_error=((dis_masked-1)**2).mean()
#     neg_error=((remain_dis_masked-(-1))**2).mean()
#     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error
    


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
    L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=error#+ConvGRU_1.error#
    
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
        
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
  
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))    



    train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore')
    
    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False
    

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        
        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start: 
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i += train_model(
                np.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                np.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                np.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                np.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                np.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                np.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX))

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                  
                exact_match=0.0
                F1_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                        np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX))
                    
#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size]
                    para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size]
                    paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    para_len=len(test_para_wordlist_list[0])
                    if para_len!=len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=batch_size
#                     print q_size
#                     print test_para_word_list
                    
                    Q_list_inword=test_Q_list_word[test_para_id:test_para_id+batch_size]
                    for q in range(batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set=para_gold_ansset_list[q]
#                         print test_para_wordlist_list[q]
#                         print Q_list_inword[q]
#                         print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match+=1
                        F1=MacroF1(pred_ans, q_gold_ans_set)
                        F1_match+=F1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc=F1_match/q_amount
                exact_acc=exact_match/q_amount
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params)
                        print 'Finished storing best  params at:', max_exact_acc  
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
                        



            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
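
The parameter updates in the training loop above are plain AdaGrad: each parameter keeps an accumulator of squared gradients, and the step is scaled by the inverse square root of that accumulator. A minimal standalone NumPy sketch of the same rule on a hypothetical one-dimensional quadratic loss (the function name and toy setup are illustrative, not part of the original code):

import numpy as np

def adagrad_step(param, grad, acc, learning_rate=0.01, eps=1e-8):
    # accumulate squared gradients, then scale the step by 1/sqrt(acc)
    acc = acc + grad ** 2
    param = param - learning_rate * grad / (np.sqrt(acc) + eps)
    return param, acc

# toy loss 0.5*(w - 3)^2, so the gradient is simply (w - 3)
w = np.zeros(1)
acc = np.zeros(1)
for _ in range(1000):
    grad = w - 3.0
    w, acc = adagrad_step(w, grad, acc, learning_rate=0.5)
print(w)  # drifts toward the minimizer at 3.0
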
Code example #5
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=200,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_concMT_BBN_NI_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #fix the random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_il_and_MT(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)
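    # repeat_common_input and repeat_des_input align every (sentence, type) pair:
    # each sentence is repeated type_size times and the type descriptions are tiled
    # batch_size times, so row i*type_size + j pairs sentence i with description j
    # in the fine-grained cosine block further below.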

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply mask with the conv_out to set the features of UNK tokens to zero
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply mask with the conv_out to set the features of UNK tokens to zero
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices, and b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    #classification layer: maps the concatenated feature vector to 12 scores, one per SF type
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))
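    # Multi-label loss: acnn_score_matrix holds an independent sigmoid score per
    # (sentence, type); T.where flips it to 1-score wherever the gold label is 0,
    # so acnn_prob_pos is the probability of the correct decision for every type
    # and acnn_loss is the mean negative log of those probabilities.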

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
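    # The extra last batch starts at test_size - batch_size and overlaps the previous
    # one; the test loop below keeps only the final n_test_remain predictions of that
    # batch so overlapping examples are not double-counted.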

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after every 20 batches, evaluate the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
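
The "dataless top-30 fine grained cosine" feature above compares every sentence word with every description word, keeps the 30 most similar pairs, and squashes their mean through a sigmoid. A minimal NumPy sketch for a single (sentence, type description) pair, assuming rows are already L2-normalized word vectors (names and shapes are illustrative, not part of the original code):

import numpy as np

def top_k_cosine_score(sent_vecs, des_vecs, k=30):
    # sent_vecs: (sent_len, emb_size), des_vecs: (des_len, emb_size), rows L2-normalized
    sims = sent_vecs.dot(des_vecs.T).ravel()    # cosine of every word pair
    top_k = np.sort(sims)[-k:]                  # the k most similar pairs
    return 1.0 / (1.0 + np.exp(-top_k.mean()))  # sigmoid, as in top_k_score_matrix

# toy usage with random (illustrative) embeddings
rng = np.random.RandomState(0)
sent = rng.randn(200, 300)
sent /= np.linalg.norm(sent, axis=1, keepdims=True)
des = rng.randn(20, 300)
des /= np.linalg.norm(des, axis=1, keepdims=True)
print(top_k_cosine_score(sent, des))
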
Code example #6
File: train_SQUAD_conv.py  Project: oneproton/SQUAD
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=10,
                    hidden_size=10,
                    L2_weight=0.0001,
                    para_len_limit=400,
                    q_len_limit=40,
                    max_EM=0.217545454546):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #     id2word = {y:x for x,y in overall_word2id.iteritems()}
    #     word2vec=load_word2vec()
    #     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    norm_extraF = normalize_matrix(extraF)

    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]

    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features
    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras

    load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    concate_paragraph_input = T.concatenate(
        [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1)

    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    #     #LSTM
    #     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    #     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    #     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)

    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    #     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor = questions_model.output_tensor

    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    #     #LSTM for questions
    #     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    #     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    #     questions_reps_tensor=questions_model.output_tensor

    #use CNN for question modeling
    #     Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
    #     conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
    #     Q_conv_para=[conv_W, conv_b]
    #     conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
    #             image_shape=(batch_size, 1, emb_size, q_len_limit),
    #             filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
    #     conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
    #     gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
    #     masked_conv_output=conv_output*gru_mask
    #     questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))

    #     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
    #     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
    #     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
    #     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
    #     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
    #     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
    #     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)

    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix = para_matrix.T
        interaction_matrix = T.dot(transpose_para_matrix,
                                   q_matrix)  #(para_len, q_len)
        norm_interaction_matrix = T.nnet.softmax(interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)  #(hidden, para_len)

    batch_q_reps, updates = theano.scan(
        fn=example_in_batch,
        outputs_info=None,
        sequences=[para_reps, questions_reps_tensor
                   ])  #batch_q_reps (batch, hidden, para_len)

    #attention distributions

    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    transformed_para_reps = T.maximum(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0)  #relu
    transformed_q_reps = T.maximum(
        T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = transformed_para_reps + transformed_q_reps

    #     U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
    #     U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
    #     accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b]
    #     accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)
    #     accu_both=accumu_model.output_tensor.transpose((0,2,1))

    prior_att = T.concatenate([add_both, norm_extraF], axis=2)

    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices = para_mask.flatten().nonzero()[0]

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.sum(
        T.log(layer3.p_y_given_x)
        [valid_indices,
         labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    #     masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    #     def AttentionLayer(q_rep, ext_M):
    #         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    #         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    #         prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    #         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    #         return strength.transpose() #(1, #words)

    #     distributions, updates = theano.scan(
    #     AttentionLayer,
    #     sequences=[questions_reps,extraF] )

    #     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    #     labels=debug_print(labels, 'labels')
    #     label_mask=T.gt(labels,0.0)
    #     neg_label_mask=T.lt(labels,0.0)
    #     dis_masked=distributions*label_mask
    #     remain_dis_masked=distributions*neg_label_mask
    #     pos_error=((dis_masked-1)**2).mean()
    #     neg_error=((remain_dis_masked-(-1))**2).mean()
    #     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+ConvGRU_1.error#

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            #             haha=para_mask[para_id:para_id+batch_size]
            #             print haha
            #             for i in range(batch_size):
            #                 print len(haha[i])
            cost_i += train_model(
                np.asarray([
                    train_para_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_Q_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_label_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_para_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_feature_matrixlist[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                F1_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    #                     print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount += batch_size
                    #                     print q_size
                    #                     print test_para_word_list

                    Q_list_inword = test_Q_list_word[
                        test_para_id:test_para_id + batch_size]
                    for q in range(batch_size):  #for each question
                        #                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        #                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        #                         else:
                        #                             ss=len(distribution_matrix[q])
                        #                             combine_list=[]
                        #                             for ii in range(ss):
                        #                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        #                             print combine_list
                        #                         exit(0)
                        #                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q], distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set = para_gold_ansset_list[q]
                        #                         print test_para_wordlist_list[q]
                        #                         print Q_list_inword[q]
                        #                         print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match += 1
                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        F1_match += F1


#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc = F1_match / q_amount
                exact_acc = exact_match / q_amount
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(
                            rootPath + 'Best_Paras_conv_' + str(max_exact_acc),
                            params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
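
The core of this reader is the word-by-word attention built in example_in_batch: every paragraph position attends over the question's GRU states and receives a weighted question summary. A minimal NumPy sketch of that single step for one paragraph/question pair (names and shapes are illustrative, not part of the original code):

import numpy as np

def softmax_rows(x):
    e = np.exp(x - x.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

def question_summary_per_position(para_states, q_states):
    # para_states: (hidden, para_len), q_states: (hidden, q_len)
    interaction = para_states.T.dot(q_states)   # (para_len, q_len) dot products
    weights = softmax_rows(interaction)         # attention over question words
    return q_states.dot(weights.T)              # (hidden, para_len) summaries

# toy usage
rng = np.random.RandomState(0)
para = rng.randn(10, 6)   # hidden=10, para_len=6
q = rng.randn(10, 4)      # hidden=10, q_len=4
print(question_summary_per_position(para, q).shape)  # (10, 6)
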
Code example #7
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[90, 90],
                    batch_size=1,
                    window_width=2,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=1.0):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DPNQ(
        rootPath + 'vocab_DPNQ.txt', rootPath +
        'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt',
        rootPath +
        'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt',
        max_s_length, maxSentLength,
        maxDocLength)  #vocab_size covers train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    #     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    #     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    #     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    # results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
    #          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
    #         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
    #         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
    # return results, line_control
    [
        train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label,
        train_Length_D, train_Length_D_s, train_Length_A1, train_Length_A2,
        train_Length_A3, train_leftPad_D, train_leftPad_D_s, train_leftPad_A1,
        train_leftPad_A2, train_leftPad_A3, train_rightPad_D,
        train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3
    ] = train_data
    [
        test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label,
        test_Length_D, test_Length_D_s, test_Length_A1, test_Length_A2,
        test_Length_A3, test_leftPad_D, test_leftPad_D_s, test_leftPad_A1,
        test_leftPad_A2, test_leftPad_A3, test_rightPad_D, test_rightPad_D_s,
        test_rightPad_A1, test_rightPad_A2, test_rightPad_A3
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_DPNQ_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # index_D is the index matrix; entries must be integers
    #     index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    #     index_A4= T.lvector()
    #     y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    #     len_Q=T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    #     len_A4=T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    #     left_Q=T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    #     left_A4=T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    #     right_Q=T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    #     right_A4=T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength,
         emb_size)).transpose(0, 2, 1)  #.dimshuffle(0, 'x', 1, 2)
    #     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    #     layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]
    #     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    #     layer2_para=[conv2_W, conv2_b]
    #     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
    #     highW_para=[high_W, high_b]

    #load_model(params)

    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b)
    #     layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)
    #     layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    #     layer0_Q_output=debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output_vector_mean,
                                   'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output_vector_mean,
                                   'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output_vector_mean,
                                   'layer0_A3.output')
    #     layer0_A4_output=debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output')

    #
    #
    #     conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
    #     layer0_para=[conv_W, conv_b]
    conv2_W, conv2_b = create_conv_para(rng,
                                        filter_shape=(nkerns[1], 1, nkerns[0],
                                                      filter_sents[1]))
    layer2_para = [conv2_W, conv2_b]
    high_W, high_b = create_highw_para(
        rng, nkerns[0], nkerns[1]
    )  # note: this requires nkerns[0] and nkerns[1] to have the same dimension
    highW_para = [high_W, high_b]
    params = layer2_para + layer0_para + highW_para  #+[embeddings]
    #     #load_model(params)
    #
    #     layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
    #             image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    # #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
    # #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    # #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #     layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
    #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    # #     layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input,
    # #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
    # #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
    #
    #     layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
    # #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
    #     layer0_A1_output=debug_print(layer0_A1.output, 'layer0_A1.output')
    #     layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output')
    #     layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output')
    # #     layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output')

    #     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    #    def __init__(self, rng, input_D, input_r, kern, left_D, right_D, dim, doc_len, topk): # length_l, length_r: valid lengths after conv
    layer1_DA1 = GRU_Average_Pooling_Scan(rng,
                                          input_D=layer0_D_output,
                                          input_r=layer0_A1_output,
                                          kern=nkerns[0],
                                          left_D=left_D,
                                          right_D=right_D,
                                          dim=maxSentLength + filter_words[1] -
                                          1,
                                          doc_len=maxDocLength,
                                          topk=3)
    layer1_DA2 = GRU_Average_Pooling_Scan(rng,
                                          input_D=layer0_D_output,
                                          input_r=layer0_A2_output,
                                          kern=nkerns[0],
                                          left_D=left_D,
                                          right_D=right_D,
                                          dim=maxSentLength + filter_words[1] -
                                          1,
                                          doc_len=maxDocLength,
                                          topk=3)
    layer1_DA3 = GRU_Average_Pooling_Scan(rng,
                                          input_D=layer0_D_output,
                                          input_r=layer0_A3_output,
                                          kern=nkerns[0],
                                          left_D=left_D,
                                          right_D=right_D,
                                          dim=maxSentLength + filter_words[1] -
                                          1,
                                          doc_len=maxDocLength,
                                          topk=3)
    #     layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0],
    #                                       left_D=left_D, right_D=right_D,
    #                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4,
    #                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1,
    #                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
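    # --- illustrative sketch (not part of the original example) -------------
    # GRU_Average_Pooling_Scan is assumed here to score each document-sentence
    # vector against the answer vector and to build the document-side
    # representation from the best-matching sentences (topk=3 above).  The
    # hypothetical helper below mimics that idea in plain numpy; it is added
    # for illustration only and is never called by this example.
    def _topk_cosine_pool_sketch(doc_sent_vectors, answer_vector, topk=3):
        import numpy as np
        doc_sent_vectors = np.asarray(doc_sent_vectors, dtype='float64')  # (num_sents, dim)
        answer_vector = np.asarray(answer_vector, dtype='float64')        # (dim,)
        # cosine similarity of every sentence vector with the answer vector
        norms = np.linalg.norm(doc_sent_vectors, axis=1) * np.linalg.norm(answer_vector) + 1e-8
        sims = doc_sent_vectors.dot(answer_vector) / norms
        best = np.argsort(-sims)[:topk]
        # average the top-k best-matching sentence vectors
        return doc_sent_vectors[best].mean(axis=0)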

    #load_model_for_conv2([conv2_W, conv2_b])#this can not be used, as the nkerns[0]!=filter_size[0]
    #conv from sentence to doc
    #     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(
        rng,
        input=layer1_DA1.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA2 = Conv_with_input_para(
        rng,
        input=layer1_DA2.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_DA3 = Conv_with_input_para(
        rng,
        input=layer1_DA3.output_D.reshape(
            (batch_size, 1, nkerns[0], dshape[1])),
        image_shape=(batch_size, 1, nkerns[0], dshape[1]),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #     layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
    #             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #conv single Q and A into doc level with same conv weights
    #     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA1.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA2.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(
        rng,
        input=layer1_DA3.output_QA_sent_level_rep.reshape(
            (batch_size, 1, nkerns[0], 1)),
        image_shape=(batch_size, 1, nkerns[0], 1),
        filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]),
        W=conv2_W,
        b=conv2_b)
    #     layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
    #             image_shape=(batch_size, 1, nkerns[0], 1),
    #             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    #     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel = debug_print(
        layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel = debug_print(
        layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel = debug_print(
        layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
    #     layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')

    #     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA1.output,
        input_r=layer2_A1_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA2 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA2.output,
        input_r=layer2_A2_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    layer3_DA3 = Average_Pooling_for_Top(
        rng,
        input_l=layer2_DA3.output,
        input_r=layer2_A3_output_sent_rep_Dlevel,
        kern=nkerns[1],
        left_l=left_D,
        right_l=right_D,
        left_r=0,
        right_r=0,
        length_l=len_D + filter_sents[1] - 1,
        length_r=1,
        dim=maxDocLength + filter_sents[1] - 1,
        topk=3)
    #     layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1],
    #                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
    #                       length_l=len_D+filter_sents[1]-1, length_r=1,
    #                        dim=maxDocLength+filter_sents[1]-1, topk=3)

    #high-way

    #     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b),
        'transform_gate_DA1')
    transform_gate_DA2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b),
        'transform_gate_DA2')
    transform_gate_DA3 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b),
        'transform_gate_DA3')
    #     transform_gate_DA4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4')
    #     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b),
        'transform_gate_A1')
    transform_gate_A2 = debug_print(
        T.nnet.sigmoid(
            T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b),
        'transform_gate_A2')
    #     transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3')
    #     transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4')

    #     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1 = (
        1.0 - transform_gate_DA1
    ) * layer1_DA1.output_D_sent_level_rep + transform_gate_DA1 * layer3_DA1.output_D_doc_level_rep
    overall_D_A2 = (
        1.0 - transform_gate_DA2
    ) * layer1_DA2.output_D_sent_level_rep + transform_gate_DA2 * layer3_DA2.output_D_doc_level_rep
    overall_D_A3 = (
        1.0 - transform_gate_DA3
    ) * layer1_DA3.output_D_sent_level_rep + transform_gate_DA3 * layer3_DA3.output_D_doc_level_rep
    #     overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep

    #     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1 = (
        1.0 - transform_gate_A1
    ) * layer1_DA1.output_QA_sent_level_rep + transform_gate_A1 * layer2_A1.output_sent_rep_Dlevel
    overall_A2 = (
        1.0 - transform_gate_A2
    ) * layer1_DA2.output_QA_sent_level_rep + transform_gate_A2 * layer2_A2.output_sent_rep_Dlevel
    #     overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel
    #     overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel
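    # --- illustrative sketch (not part of the original example) -------------
    # The highway-style mixing above combines the sentence-level and the
    # doc-level representations with a learned gate:
    #     gate    = sigmoid(dot(high_W, sent_rep) + high_b)
    #     overall = (1 - gate) * sent_rep + gate * doc_rep
    # A minimal numpy version of that combination, added for illustration only
    # and never called here:
    def _highway_mix_sketch(sent_rep, doc_rep, w, b):
        import numpy as np
        gate = 1.0 / (1.0 + np.exp(-(np.dot(w, sent_rep) + b)))  # sigmoid gate
        return (1.0 - gate) * sent_rep + gate * doc_rep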

    simi_sent_level1 = debug_print(
        cosine(layer1_DA1.output_D_sent_level_rep,
               layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2 = debug_print(
        cosine(layer1_DA2.output_D_sent_level_rep,
               layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
    #     simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
    #     simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')

    simi_doc_level1 = debug_print(
        cosine(layer3_DA1.output_D_doc_level_rep,
               layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2 = debug_print(
        cosine(layer3_DA2.output_D_doc_level_rep,
               layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
    #     simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
    #     simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')

    simi_overall_level1 = debug_print(cosine(overall_D_A1, overall_A1),
                                      'simi_overall_level1')
    simi_overall_level2 = debug_print(cosine(overall_D_A2, overall_A2),
                                      'simi_overall_level2')
    #     simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3')
    #     simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4')

    #     simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1
    #     simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2

    simi_1 = (simi_overall_level1 + simi_sent_level1 + simi_doc_level1) / 3.0
    simi_2 = (simi_overall_level2 + simi_sent_level2 + simi_doc_level2) / 3.0
    #     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
    #     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #     #only use overall_simi
    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    #     posi_simi=simi_overall_level1
    #     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
    #     cost=T.maximum(0.0, margin+simi_2-simi_1)
    simi_PQ = cosine(layer1_DA1.output_QA_sent_level_rep,
                     layer1_DA3.output_D_sent_level_rep)
    simi_NQ = cosine(layer1_DA2.output_QA_sent_level_rep,
                     layer1_DA3.output_D_sent_level_rep)
    #bad matching at overall level
    #     simi_PQ=cosine(overall_A1, overall_D_A3)
    #     simi_NQ=cosine(overall_A2, overall_D_A3)
    match_cost = T.maximum(0.0, margin + simi_NQ - simi_PQ)
    cost = T.maximum(
        0.0, margin + simi_sent_level2 - simi_sent_level1) + T.maximum(
            0.0, margin + simi_doc_level2 - simi_doc_level1) + T.maximum(
                0.0, margin + simi_overall_level2 - simi_overall_level1)
    cost = cost + match_cost
    #     posi_simi=simi_1
    #     nega_simi=simi_2
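    # --- illustrative sketch (not part of the original example) -------------
    # Every term of the cost above is a margin ranking (hinge) loss of the
    # form max(0, margin + nega_simi - posi_simi): it becomes zero once the
    # positive answer beats the negative one by at least `margin`.  A
    # hypothetical stand-alone version, added for illustration only:
    def _margin_ranking_loss_sketch(posi_simi, nega_simi, margin=1.0):
        return max(0.0, margin + nega_simi - posi_simi)
    # e.g. _margin_ranking_loss_sketch(0.9, 0.2)  == 0.3  (still penalised)
    #      _margin_ranking_loss_sketch(0.9, -0.5) == 0.0  (margin satisfied)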

    L2_reg = debug_print(
        (high_W**2).sum() + 3 * (conv2_W**2).sum() + (U**2).sum() +
        (W**2).sum(), 'L2_reg'
    )  #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [
            cost, simi_sent_level1, simi_sent_level2, simi_doc_level1,
            simi_doc_level2, simi_overall_level1, simi_overall_level2
        ],
        givens={
            index_D: test_data_D[index],  #a matrix
            #             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            #             index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            #             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            #             len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            #             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            #             left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            #             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index]
            #             right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / T.sqrt(acc)))  #AdaGrad
        updates.append((acc_i, acc))


    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         acc = acc_i + T.sqr(grad_i)
    #         if param_i == embeddings:
    #             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
    #         else:
    #             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
    #         updates.append((acc_i, acc))
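    # --- illustrative sketch (not part of the original example) -------------
    # The update rule above is plain AdaGrad: each parameter keeps a running
    # sum of squared gradients and its effective step size shrinks over time.
    # A hypothetical numpy version of one step (lr is just a placeholder
    # default, and an epsilon is added here for numerical safety; the Theano
    # code above assumes acc stays positive):
    def _adagrad_step_sketch(param, grad, acc, lr=0.01, eps=1e-8):
        import numpy as np
        acc = acc + np.square(grad)                       # accumulate squared gradients
        param = param - lr * grad / np.sqrt(acc + eps)    # per-parameter scaled step
        return param, acc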

    train_model = theano.function(
        [index],
        [
            cost, simi_sent_level1, simi_sent_level2, simi_doc_level1,
            simi_doc_level2, simi_overall_level1, simi_overall_level2
        ],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            #             index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            #             len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            #             left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
            #             right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index],
        [
            cost, simi_sent_level1, simi_sent_level2, simi_doc_level1,
            simi_doc_level2, simi_overall_level1, simi_overall_level2
        ],
        givens={
            index_D: train_data_D[index],
            #             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            #             index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            #             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            #             len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            #             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            #             left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            #             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
            #             right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        shuffle(train_batch_start)  #shuffle training data

        posi_train_sent = []
        nega_train_sent = []
        posi_train_doc = []
        nega_train_doc = []
        posi_train_overall = []
        nega_train_overall = []
        for batch_start in train_batch_start:
            # iter counts how many batches have been run so far across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()
            minibatch_index = minibatch_index + 1

            cost_average, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2 = train_model(
                batch_start)
            posi_train_sent.append(simi_sent_level1)
            nega_train_sent.append(simi_sent_level2)
            posi_train_doc.append(simi_doc_level1)
            nega_train_doc.append(simi_doc_level2)
            posi_train_overall.append(simi_overall_level1)
            nega_train_overall.append(simi_overall_level2)
            if iter % n_train_batches == 0:
                corr_train_sent = compute_corr(posi_train_sent,
                                               nega_train_sent)
                corr_train_doc = compute_corr(posi_train_doc, nega_train_doc)
                corr_train_overall = compute_corr(posi_train_overall,
                                                  nega_train_overall)
                print 'training @ iter = ' + str(
                    iter
                ) + ' average cost: ' + str(cost_average) + 'corr rate:' + str(
                    corr_train_sent * 300.0 / train_size) + ' ' + str(
                        corr_train_doc * 300.0 / train_size) + ' ' + str(
                            corr_train_overall * 300.0 / train_size)

            if iter % validation_frequency == 0:
                posi_test_sent = []
                nega_test_sent = []
                posi_test_doc = []
                nega_test_doc = []
                posi_test_overall = []
                nega_test_overall = []
                for i in test_batch_start:
                    cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2 = test_model(
                        i)
                    posi_test_sent.append(simi_sent_level1)
                    nega_test_sent.append(simi_sent_level2)
                    posi_test_doc.append(simi_doc_level1)
                    nega_test_doc.append(simi_doc_level2)
                    posi_test_overall.append(simi_overall_level1)
                    nega_test_overall.append(simi_overall_level2)
                corr_test_sent = compute_corr(posi_test_sent, nega_test_sent)
                corr_test_doc = compute_corr(posi_test_doc, nega_test_doc)
                corr_test_overall = compute_corr(posi_test_overall,
                                                 nega_test_overall)

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc_sent = corr_test_sent * 1.0 / (test_size / 3.0)
                test_acc_doc = corr_test_doc * 1.0 / (test_size / 3.0)
                test_acc_overall = corr_test_overall * 1.0 / (test_size / 3.0)
                #test_acc=1-test_score
                #                 print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                #                            'model %f %%') %
                #                           (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                print '\t\t\tepoch', epoch, ', minibatch', minibatch_index, '/', n_train_batches, 'test acc of best model', test_acc_sent * 100, test_acc_doc * 100, test_acc_overall * 100
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                find_better = False
                if test_acc_sent > max_acc:
                    max_acc = test_acc_sent
                    best_epoch = epoch
                    find_better = True
                if test_acc_doc > max_acc:
                    max_acc = test_acc_doc
                    best_epoch = epoch
                    find_better = True
                if test_acc_overall > max_acc:
                    max_acc = test_acc_overall
                    best_epoch = epoch
                    find_better = True
                print '\t\t\tmax:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code Example #8
0
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[50,50], batch_size=1, window_width=3,
                    maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200,
                    L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0, decay=0.95):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/';
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DQAAAA(rootPath+'vocab_DQAAAA.txt', rootPath+'mc500.train.tsv_standardlized.txt_DQAAAA.txt', rootPath+'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test


    [train_data_D, train_data_Q, train_data_A1, train_data_A2, train_data_A3, train_data_A4, train_Label, 
                 train_Length_D,train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2, train_Length_A3, train_Length_A4,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3, train_leftPad_A4,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_Q, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3, train_rightPad_A4]=train_data
    [test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3, test_data_A4, test_Label, 
                 test_Length_D,test_Length_D_s, test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3, test_Length_A4,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_Q, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3, test_rightPad_A4]=test_data                


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DQAAAA_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # index_D is the index matrix; entries must be integers
    index_Q = T.lvector()
    index_A1= T.lvector()
    index_A2= T.lvector()
    index_A3= T.lvector()
    index_A4= T.lvector()
#     y = T.lvector()  
    
    len_D=T.lscalar()
    len_D_s=T.lvector()
    len_Q=T.lscalar()
    len_A1=T.lscalar()
    len_A2=T.lscalar()
    len_A3=T.lscalar()
    len_A4=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
    left_Q=T.lscalar()
    left_A1=T.lscalar()
    left_A2=T.lscalar()
    left_A3=T.lscalar()
    left_A4=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
    right_Q=T.lscalar()
    right_A1=T.lscalar()
    right_A2=T.lscalar()
    right_A3=T.lscalar()
    right_A4=T.lscalar()
        


    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1), 'layer0_D_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_Q_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose(), 'layer0_A1_input')#.dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape((maxSentLength, emb_size)).transpose()#.dimshuffle(0, 'x', 1, 2)
    
        
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b] 
#     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
#     layer2_para=[conv2_W, conv2_b]
#     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
#     highW_para=[high_W, high_b]

    #load_model(params)
    
    
    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D,:,:],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U,W=W,b=b)
    layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:,left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:,left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:,left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)

    
    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output=debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output')
    layer0_A1_output=debug_print(layer0_A1.output_vector_mean, 'layer0_A1.output')
    layer0_A2_output=debug_print(layer0_A2.output_vector_mean, 'layer0_A2.output')
    layer0_A3_output=debug_print(layer0_A3.output_vector_mean, 'layer0_A3.output')
    layer0_A4_output=debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output')
    
    #before reasoning, do a GRU for doc: d
    U_d, W_d, b_d=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d_para=[U_d, W_d, b_d]
    layer_D_GRU = GRU_Matrix_Input(X=layer0_D_output, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d,W=W_d,b=b_d,bptt_truncate=-1)
    
    #Reasoning Layer 1
    repeat_Q=debug_print(T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0],1)), maxDocLength, axis=1)[:,:layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN=debug_print(T.concatenate([layer_D_GRU.output_matrix,repeat_Q], axis=0).transpose(), 'input_DNN')#each row is an example
    output_DNN1=HiddenLayer(rng, input=input_DNN, n_in=nkerns[0]*2, n_out=nkerns[0])
    output_DNN2=HiddenLayer(rng, input=output_DNN1.output, n_in=nkerns[0], n_out=nkerns[0])
    
    DNN_out=debug_print(output_DNN2.output.transpose(), 'DNN_out')
    U_p, W_p, b_p=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para=[U_p, W_p, b_p] 
    pooling=GRU_Matrix_Input(X=DNN_out, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p,W=W_p,b=b_p,bptt_truncate=-1)
    translated_Q1=debug_print(pooling.output_vector_max, 'translated_Q1')


    #before reasoning, do a GRU for doc: d2
    U_d2, W_d2, b_d2=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d2_para=[U_d2, W_d2, b_d2]
    layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_d2,W=W_d2,b=b_d2,bptt_truncate=-1)
    #Reasoning Layer 2
    repeat_Q1=debug_print(T.repeat(translated_Q1.reshape((translated_Q1.shape[0],1)), maxDocLength, axis=1)[:,:layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1')
    input_DNN2=debug_print(T.concatenate([layer_D2_GRU.output_matrix,repeat_Q1], axis=0).transpose(), 'input_DNN2')#each row is an example
    output_DNN3=HiddenLayer(rng, input=input_DNN2, n_in=nkerns[0]*2, n_out=nkerns[0])
    output_DNN4=HiddenLayer(rng, input=output_DNN3.output, n_in=nkerns[0], n_out=nkerns[0])
    
    DNN_out2=debug_print(output_DNN4.output.transpose(), 'DNN_out2')
    U_p2, W_p2, b_p2=create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para2=[U_p2, W_p2, b_p2] 
    pooling2=GRU_Matrix_Input(X=DNN_out2, word_dim=nkerns[0], hidden_dim=nkerns[0],U=U_p2,W=W_p2,b=b_p2,bptt_truncate=-1)
    translated_Q2=debug_print(pooling2.output_vector_max, 'translated_Q2')
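    # --- illustrative sketch (not part of the original example) ---------------
    # Each "reasoning layer" above tiles the current question vector over the
    # document positions, concatenates it with the document GRU states, runs a
    # two-layer MLP column-wise, and pools the result into an updated question
    # vector (translated_Q1, then translated_Q2).  The hypothetical numpy
    # helper below mimics that data flow with a tanh MLP and max pooling
    # standing in for the pooling GRU; it is never called by this example.
    def _reasoning_hop_sketch(doc_states, question, W1, b1, W2, b2):
        import numpy as np
        doc_len = doc_states.shape[1]
        repeat_q = np.repeat(question.reshape((-1, 1)), doc_len, axis=1)  # (hidden, doc_len)
        concat = np.concatenate([doc_states, repeat_q], axis=0)           # (2*hidden, doc_len)
        h1 = np.tanh(np.dot(W1, concat) + b1.reshape((-1, 1)))            # first MLP layer
        h2 = np.tanh(np.dot(W2, h1) + b2.reshape((-1, 1)))                # second MLP layer
        return h2.max(axis=1)                                             # stand-in for the pooling GRU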
    

    QA1=T.concatenate([translated_Q2, layer0_A1_output], axis=0)
    QA2=T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3=T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4=T.concatenate([translated_Q2, layer0_A4_output], axis=0)
    
    W_HL,b_HL=create_HiddenLayer_para(rng, n_in=nkerns[0]*2, n_out=1)
    match_params=[W_HL,b_HL]
    QA1_match=HiddenLayer(rng, input=QA1, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA2_match=HiddenLayer(rng, input=QA2, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA3_match=HiddenLayer(rng, input=QA3, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
    QA4_match=HiddenLayer(rng, input=QA4, n_in=nkerns[0]*2, n_out=1, W=W_HL, b=b_HL)
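    # --- illustrative sketch (not part of the original example) ---------------
    # Each candidate answer is scored by the same shared HiddenLayer applied to
    # the concatenation [translated question; answer representation]; its single
    # output unit acts as the matching score.  A hypothetical numpy version
    # (assuming the usual tanh HiddenLayer), added for illustration only and
    # never called here:
    def _qa_match_score_sketch(question_vec, answer_vec, W_hl, b_hl):
        import numpy as np
        qa = np.concatenate([question_vec, answer_vec], axis=0)  # (2*hidden,)
        return np.tanh(np.dot(qa, W_hl) + b_hl)[0]               # scalar match score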
    
    
    
#     simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1')
#     simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2')
#     simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3')
#     simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4')

    simi_overall_level1=debug_print(QA1_match.output[0], 'simi_overall_level1')
    simi_overall_level2=debug_print(QA2_match.output[0], 'simi_overall_level2')
    simi_overall_level3=debug_print(QA3_match.output[0], 'simi_overall_level3')
    simi_overall_level4=debug_print(QA4_match.output[0], 'simi_overall_level4')


#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))
 
    #only use overall_simi    
    cost=T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level3-simi_overall_level1)+T.maximum(0.0, margin+simi_overall_level4-simi_overall_level1)
    
#     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    posi_simi=simi_overall_level1
    nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
#     #use ensembled simi
#     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
#     posi_simi=simi_1
#     nega_simi=T.max([simi_2, simi_3, simi_4])


    
    L2_reg =debug_print((U**2).sum()+(W**2).sum()
                        +(U_p**2).sum()+(W_p**2).sum()
                        +(U_p2**2).sum()+(W_p2**2).sum()
                        +(U_d**2).sum()+(W_d**2).sum()
                        +(U_d2**2).sum()+(W_d2**2).sum()
                        +(output_DNN1.W**2).sum()+(output_DNN2.W**2).sum()
                        +(output_DNN3.W**2).sum()+(output_DNN4.W**2).sum()
                        +(W_HL**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    


    
    test_model = theano.function([index], [cost, posi_simi, nega_simi],
          givens={
            index_D: test_data_D[index], #a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],

            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
        
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
            
            }, on_unused_input='ignore')


    params = layer0_para+output_DNN1.params+output_DNN2.params+output_DNN3.params+output_DNN4.params+layer_pooling_para+layer_pooling_para2+match_params+layer_d_para+layer_d2_para
    
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)


#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6)))   
#         updates.append((acc_i, acc))      
 
    def AdaDelta_updates(parameters,gradients,rho,eps):
        # create variables to store intermediate updates
        gradients_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ]
        deltas_sq = [ theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters ]
     
        # calculates the new "average" delta for the next iteration
        gradients_sq_new = [ rho*g_sq + (1-rho)*(g**2) for g_sq,g in zip(gradients_sq,gradients) ]
     
        # calculates the step for each parameter; the square roots give the RMS of the running averages
        deltas = [ (T.sqrt(d_sq+eps)/T.sqrt(g_sq+eps))*grad for d_sq,g_sq,grad in zip(deltas_sq,gradients_sq_new,gradients) ]
     
        # calculates the new "average" deltas for the next step.
        deltas_sq_new = [ rho*d_sq + (1-rho)*(d**2) for d_sq,d in zip(deltas_sq,deltas) ]
     
        # pack all three groups of updates into a single list for theano.function
        gradient_sq_updates = zip(gradients_sq,gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq,deltas_sq_new)
        parameters_updates = [ (p,p - d) for p,d in zip(parameters,deltas) ]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates   
    
    updates=AdaDelta_updates(params, grads, decay, 1e-6)
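    # --- illustrative sketch (not part of the original example) ---------------
    # AdaDelta_updates above keeps two running averages per parameter, one of
    # squared gradients and one of squared parameter deltas, and scales every
    # step by the ratio of their RMS values, so no global learning rate is
    # needed.  A hypothetical numpy version of a single step, added for
    # illustration only and never called here:
    def _adadelta_step_sketch(param, grad, g_sq, d_sq, rho=0.95, eps=1e-6):
        import numpy as np
        g_sq = rho * g_sq + (1.0 - rho) * np.square(grad)           # running avg of grad^2
        delta = np.sqrt(d_sq + eps) / np.sqrt(g_sq + eps) * grad    # RMS-scaled step
        d_sq = rho * d_sq + (1.0 - rho) * np.square(delta)          # running avg of delta^2
        return param - delta, g_sq, d_sq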
  
    train_model = theano.function([index], [cost, posi_simi, nega_simi], updates=updates,
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost, posi_simi, nega_simi],
          givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
        
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data


        corr_train=0
        for batch_start in train_batch_start: 
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) )
            sys.stdout.flush()

            minibatch_index=minibatch_index+1
            
            cost_average, posi_simi, nega_simi= train_model(batch_start)
            if posi_simi>nega_simi:
                corr_train+=1
            
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+', average cost: '+str(cost_average)+', corr rate: '+str(corr_train*100.0/train_size)

            
            if iter % validation_frequency == 0:
                corr_test=0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi=test_model(i)
                    if posi_simi>nega_simi:
                        corr_test+=1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc=corr_test*1.0/test_size
                #test_acc=1-test_score
                print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 

  
                find_better=False
                if test_acc > max_acc:
                    max_acc=test_acc
                    best_epoch=epoch    
                    find_better=True             
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at epoch', best_epoch, ')'
                if find_better:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'  

            if patience <= iter:
                done_looping = True
                break
        
        
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()
   
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
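
For reference, a minimal NumPy sketch of the AdaDelta rule that AdaDelta_updates implements in the listing above; this is illustrative only, and the names and numbers below are invented rather than taken from the original code:

import numpy as np

def adadelta_step(param, grad, g_sq, d_sq, rho=0.95, eps=1e-6):
    # running average of squared gradients
    g_sq = rho * g_sq + (1.0 - rho) * grad ** 2
    # rescale the gradient by the ratio of the two RMS terms
    step = (np.sqrt(d_sq + eps) / np.sqrt(g_sq + eps)) * grad
    # running average of squared steps, used to scale future updates
    d_sq = rho * d_sq + (1.0 - rho) * step ** 2
    return param - step, g_sq, d_sq

# one toy update of a single scalar parameter
p, g_sq, d_sq = adadelta_step(param=1.0, grad=0.3, g_sq=0.0, d_sq=0.0)
print("updated parameter: %.6f" % p)
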
Code example #9
0
File: train_KBEmbedding.py Project: yinwenpeng/KBEmb
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[50], batch_size=1000, window_width=4,
                    maxSentLength=64, emb_size=5, hidden_size=50,
                    margin=0.5, L2_weight=0.0004, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=483142):
    maxSentLength=max_truncate+2*(window_width-1)
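    # each side is padded with window_width-1 extra positions so that a filter of
    # width window_width can be aligned with every real token of a sequence
    # truncated to max_truncate tokens ("wide" convolution)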
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/FB15k/'
    rng = numpy.random.RandomState(1234)
    triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contains train, dev and test
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)

#     print triples
#     print entity_count
#     print relation_count
#     exit(0)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
#     mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
#     wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    
    entity_count=theano.shared(numpy.asarray(entity_count, dtype=theano.config.floatX), borrow=True)
    entity_count=T.cast(entity_count, 'int64')
    relation_count=theano.shared(numpy.asarray(relation_count, dtype=theano.config.floatX), borrow=True)
    relation_count=T.cast(relation_count, 'int64')    


    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
    GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=2) 
    #cost_tmp=0
    
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)
    batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
    batch_start=T.cast(batch_start, 'int64')   
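    # the starts cover the triples in chunks of batch_size; when line_no is not a
    # multiple of batch_size, one extra start at line_no-batch_size lets a final
    # (overlapping) batch pick up the remainder,
    # e.g. line_no=10, batch_size=4 gives batch_start=[0, 4, 6]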
    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.imatrix('x_index_r')
#     y = T.ivector('y')  
#     left_l=T.iscalar()
#     right_l=T.iscalar()
#     left_r=T.iscalar()
#     right_r=T.iscalar()
#     length_l=T.iscalar()
#     length_r=T.iscalar()
#     norm_length_l=T.fscalar()
#     norm_length_r=T.fscalar()
#     mts=T.fmatrix()
#     wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
#     #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
#     ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
#     filter_size=(emb_size,window_width)
#     #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    
    zero_entity_E=T.zeros((entity_size, emb_size))  
    zero_relation_E=T.zeros((relation_size, emb_size))      
    entity_E_hat_1, relation_E_hat_1=all_batches(batch_start, batch_size, x_index_l, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, zero_entity_E,zero_relation_E, entity_count, entity_size, relation_count, relation_size)
#     for start in batch_start:
#         batch_triple_indices=x_index_l[start:start+batch_size]
# #         entity_E_hat_1, relation_E_hat_1=one_iteration_parallel(batch_triple_indices, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, entity_size, relation_size, entity_count, relation_count)     
#         new_entity_E,new_relation_E=one_batch_parallel(batch_triple_indices, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, new_entity_E,new_relation_E)
# 
#     entity_count=debug_print(entity_count.reshape((entity_size,1)), 'entity_count')
#     relation_count=debug_print(relation_count.reshape((relation_size, 1)), 'relation_count')
#     entity_E_hat_1=debug_print(new_entity_E/entity_count+1e-6, 'entity_E_hat_1') #to get rid of zero incoming info
#     relation_E_hat_1=debug_print(new_relation_E/relation_count, 'relation_E_hat_1')
#     
#     entity_E_hat_1, relation_E_hat_1=one_iteration_parallel(x_index_l, entity_E, relation_E, GRU_U, GRU_W, GRU_b, emb_size, entity_size, relation_size, entity_count, relation_count)     
#     
    entity_E_updated_1=GRU_Combine_2Matrix(entity_E, entity_E_hat_1, emb_size, GRU_U_combine[0], GRU_W_combine[0], GRU_b_combine[0])
    relation_E_updated_1=GRU_Combine_2Matrix(relation_E, relation_E_hat_1, emb_size, GRU_U_combine[1], GRU_W_combine[1], GRU_b_combine[1])
#     cost=((entity_E_hat_1-entity_E)**2).sum()+((relation_E_hat_1-relation_E)**2).sum()
    cost_1=((entity_E_updated_1-entity_E)**2).sum()+((relation_E_updated_1-relation_E)**2).sum()


    entity_E_hat_2, relation_E_hat_2=all_batches(batch_start, batch_size, x_index_l, entity_E_updated_1, relation_E_updated_1, GRU_U, GRU_W, GRU_b, emb_size, zero_entity_E,zero_relation_E, entity_count, entity_size, relation_count, relation_size)    
#     entity_E_hat_2, relation_E_hat_2=one_iteration_parallel(x_index_l, entity_E_updated_1, relation_E_updated_1, GRU_U, GRU_W, GRU_b, emb_size, entity_size, relation_size, entity_count, relation_count)
    entity_E_last_2=GRU_Combine_2Matrix(entity_E_updated_1, entity_E_hat_2, emb_size, GRU_U_combine[0], GRU_W_combine[0], GRU_b_combine[0])
    relation_E_last_2=GRU_Combine_2Matrix(relation_E_updated_1, relation_E_hat_2, emb_size, GRU_U_combine[1], GRU_W_combine[1], GRU_b_combine[1])    
     
    L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\
                      +(GRU_U** 2).sum()+(GRU_W** 2).sum()\
                      +(GRU_U_combine** 2).sum()+(GRU_W_combine** 2).sum(), 'L2_reg')
    cost_sys=((entity_E_last_2-entity_E_updated_1)**2).sum()+((relation_E_last_2-relation_E_updated_1)**2).sum()
    cost=cost_sys+L2_weight*L2_loss
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b, GRU_U_combine, GRU_W_combine, GRU_b_combine]
#     params_conv = [conv_W, conv_b]
    params_to_store=[GRU_U, GRU_W, GRU_b, GRU_U_combine, GRU_W_combine, GRU_b_combine]#, entity_E_last_2, relation_E_last_2]
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))   #AdaGrad
        updates.append((acc_i, acc))    
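    # AdaGrad: acc accumulates the squared gradients seen so far, so the effective
    # step size learning_rate/(sqrt(acc)+1e-8) shrinks for parameters that keep
    # receiving large gradients; the epsilon only guards against division by zero
    # on the very first update.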
  
    train_model = theano.function([x_index_l], [cost_1,cost_sys, entity_E_last_2, relation_E_last_2], updates=updates,on_unused_input='ignore')
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_1, cost_l, entity_E_store, relation_E_store= train_model(triples)
                #print 'layer3_input', layer3_input
        print 'epoch:', epoch, 'cost:', cost_1, cost_l

#             if patience <= iter:
#                 done_looping = True
#                 break
        #after each epoch, increase the batch_size

#         exit(0)
        
        #store the paras after epoch 15
#         if epoch ==22:
        entity_E_store=theano.shared(numpy.asarray(entity_E_store, dtype=theano.config.floatX), borrow=True)
        relation_E_store=theano.shared(numpy.asarray(relation_E_store, dtype=theano.config.floatX), borrow=True)
        epoch_params_to_store = params_to_store + [entity_E_store, relation_E_store]  #build a fresh list so params_to_store does not grow every epoch
        store_model_to_file(triple_path+'Best_Paras', epoch_params_to_store)
        print 'Finished storing params of this epoch'
#             exit(0)
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #10
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=300, hidden_size=300,
                    L2_weight=0.0001, margin=0.5,
                    train_size=4000000, test_size=1000, 
                    max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.052):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)
    word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len)



    test_ground_truth,test_candidates,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len)
    
    
    
    

    overall_vocab_size=len(word2id)
    print 'vocab size:', overall_vocab_size


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234))
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()

    left=T.imatrix()  #(2*batch, len)
    left_mask=T.fmatrix() #(2*batch, len)
    span=T.imatrix()  #(2*batch, span_len)
    span_mask=T.fmatrix() #(2*batch, span_len)
    right=T.imatrix()  #(2*batch, len)
    right_mask=T.fmatrix() #(2*batch, len)
    q=T.imatrix()  #(2*batch, len_q)
    q_mask=T.fmatrix() #(2*batch, len_q)





    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b]
    
    U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size)
    U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size)
    GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b]
    
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)

    attend_para=[W_a1, W_a2]
    params = [embeddings]+GRU1_para+attend_para+GRU2_para
#     load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params)

    left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context)
    span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span)
    right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context)
    q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q)


    left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    left_reps=left_model.output_tensor #(batch, emb, para_len)

    span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    span_reps=span_model.output_tensor #(batch, emb, para_len)

    right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    right_reps=right_model.output_tensor #(batch, emb, para_len)

    q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    q_reps=q_model.output_tensor #(batch, emb, para_len)

    #interaction
    left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps)
    span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps)
    right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps)

#     q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps)
#     q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps)
#     q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps)

    #combine


    origin_W=normalize_matrix(W_a1)
    attend_W=normalize_matrix(W_a2)

    left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W)
    span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W)
    right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W)
    q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W)

    left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W)
    span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W)
    right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W)

    q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)
    q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)
    q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)


    add_left=left_origin_reps+q_attend_left_reps  #(2*batch, len ,hidden)
    add_span=span_origin_reps+q_attend_span_reps
    add_right=right_origin_reps+q_attend_right_reps

    add_q_by_left=q_origin_reps+left_attend_q_reps
    add_q_by_span=q_origin_reps+span_attend_q_reps
    add_q_by_right=q_origin_reps+right_attend_q_reps

    #second GRU


    add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden)
    question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, add_q_by_right_reps], axis=1)   #(batch, 3*hidden)

    simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat)  #(2*batch)

    pos_simi_vec=simi_list[::2]
    neg_simi_vec=simi_list[1::2]

    raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec)
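    # pairwise hinge (margin ranking) loss: rows of simi_list are interleaved as
    # (positive, negative) pairs, so even positions are treated as the positive
    # (correct-span) examples and odd positions as their paired negatives; each
    # pair contributes max(0, margin + sim_neg - sim_pos), which is zero once the
    # positive scores at least `margin` above its negative.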



    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=T.sum(raw_loss)#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))



    train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless


    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size    #batch_size means how many pairs
    remain_train=train_size%batch_size
#     train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always an even number
    if remain_train>0:
        train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 
    else:
        train_batch_start=list(np.arange(n_train_batches)*batch_size)




    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_odd_ids = list(np.arange(train_size)*2)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_odd_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]]
            train_id_list=sum(train_id_list,[])
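            # despite the name, train_odd_ids holds the even indices 0, 2, 4, ...;
            # the slicing simi_list[::2]/[1::2] above assumes example 2k is a
            # positive and 2k+1 its paired negative, so each pair is kept adjacent
            # here, and sum(list_of_pairs, []) simply flattens the list of pairs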
#             print train_id_list
            cost_i+= train_model(
                                np.asarray([train_lefts[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_spans[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_rights[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_questions[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX))

            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match=0.0
                F1_match=0.0


                for test_pair_id in range(test_size):
                    test_example_lefts=test_lefts[test_pair_id]
                    test_example_lefts_mask=test_lefts_mask[test_pair_id]
                    test_example_spans=test_spans[test_pair_id]
                    test_example_spans_mask=test_spans_mask[test_pair_id]
                    test_example_rights=test_rights[test_pair_id]
                    test_example_rights_mask=test_rights_mask[test_pair_id]
                    test_example_questions=test_questions[test_pair_id]
                    test_example_questions_mask=test_questions_mask[test_pair_id]       
                    test_example_candidates=test_candidates[test_pair_id]
                    
                    
                    
                    test_example_size=len(test_example_lefts)
#                     print 'test_pair_id, test_example_size:', test_pair_id, test_example_size
                    if test_example_size < test_batch_size:
                        #pad
                        pad_size=test_batch_size-test_example_size
                        test_example_lefts+=test_example_lefts[-1:]*pad_size
                        test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size
                        test_example_spans+=test_example_spans[-1:]*pad_size
                        test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size
                        test_example_rights+=test_example_rights[-1:]*pad_size
                        test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size
                        test_example_questions+=test_example_questions[-1:]*pad_size
                        test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size 
                        test_example_candidates+=test_example_candidates[-1:]*pad_size
                        
                        test_example_size=test_batch_size
                    
                                            
                    n_test_batches=test_example_size/test_batch_size
                    n_test_remain=test_example_size%test_batch_size
                    if n_test_remain > 0:
                        test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size]
                    else:
                        test_batch_start=list(np.arange(n_test_batches)*test_batch_size)
                    all_simi_list=[]
                    all_cand_list=[]
                    for test_para_id in test_batch_start:
                        simi_return_vector=test_model(
                                    np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX))
                        candidate_list=test_example_candidates[test_para_id:test_para_id+test_batch_size]
                        all_simi_list+=list(simi_return_vector)
                        all_cand_list+=candidate_list
                    top1_cand=all_cand_list[np.argsort(all_simi_list)[-1]]
#                     print top1_cand, test_ground_truth[test_pair_id]

                    if top1_cand == test_ground_truth[test_pair_id]:
                        exact_match+=1
                    F1=macrof1(top1_cand, test_ground_truth[test_pair_id])
#                     print '\t\t\t', F1
                    F1_match+=F1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc=F1_match/test_size
                exact_acc=exact_match/test_size
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
#                     store_model_to_file(params, emb_size)
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Para_'+str(max_EM), params)
                        print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
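
A small NumPy sketch of the interleaved pairwise hinge loss used in the listing above; the similarity values are invented purely for illustration:

import numpy as np

margin = 0.5
# interleaved similarities: [pos_0, neg_0, pos_1, neg_1, ...]
simi_list = np.array([0.9, 0.2, 0.4, 0.6])
pos_simi_vec = simi_list[::2]
neg_simi_vec = simi_list[1::2]
# penalize a pair whenever the negative comes within `margin` of the positive
raw_loss = np.maximum(0.0, margin + neg_simi_vec - pos_simi_vec)
print("per-pair losses: %s, total cost: %.2f" % (raw_loss, raw_loss.sum()))
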
Code example #11
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    emb_size=40,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    all_sentences, all_masks, all_labels, all_other_labels, word2id = load_BBN_il5Trans_il5_dataset(
        maxlen=maxSentLen
    )  #minlen, include one label, at least one word in the sentence
    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_sents = np.asarray(all_sentences[0], dtype='int32')
    train_masks = np.asarray(all_masks[0], dtype=theano.config.floatX)
    train_labels = np.asarray(all_labels[0], dtype='int32')
    train_size = len(train_labels)

    dev_sents = np.asarray(all_sentences[1], dtype='int32')
    dev_masks = np.asarray(all_masks[1], dtype=theano.config.floatX)
    dev_labels = np.asarray(all_labels[1], dtype='int32')
    dec_other_labels = np.asarray(all_other_labels, dtype='int32')
    dev_size = len(dev_labels)
    '''
    combine train and dev
    '''
    train_sents = np.concatenate([train_sents, dev_sents], axis=0)
    train_masks = np.concatenate([train_masks, dev_masks], axis=0)
    train_labels = np.concatenate([train_labels, dev_labels], axis=0)
    train_size = train_size + dev_size

    test_sents = np.asarray(all_sentences[2], dtype='int32')
    test_masks = np.asarray(all_masks[2], dtype=theano.config.floatX)
    test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_labels)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + 'IL5-cca-wiki-lorelei-d40.eng.vec',
        emb_root + 'IL5-cca-wiki-lorelei-d40.IL5.vec'
    ], 40)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #multiply the mask with conv_out to zero out features coming from padding/UNK positions
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  #multiply the mask with conv_out to zero out features coming from padding/UNK positions
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    LR_input = T.concatenate([sent_embeddings, sent_embeddings2, bow_emb],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    # U_a = create_ensemble_para(rng, 12, LR_input_size) # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((12,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    # LR_para=[U_a, LR_b]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    # score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    # prob_pos = T.where( labels < 1, 1.0-score_matrix, score_matrix)
    # loss = -T.mean(T.log(prob_pos))

    # U_a = create_ensemble_para(rng, 12+12, LR_input_size) # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((12+12,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    U_a, LR_b = create_LR_para(rng, LR_input_size, 12)
    LR_para = [U_a, LR_b]
    layer_LR = LogisticRegression(
        rng, input=LR_input, n_in=LR_input_size, n_out=12, W=U_a, b=LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(layer_LR.before_softmax)  #batch * 12
    prob_pos = T.where(labels < 1, 1.0 - score_matrix, score_matrix)
    loss = -T.mean(T.log(prob_pos))
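    # multi-label objective: every one of the 12 SF types gets an independent
    # sigmoid score, T.where keeps the probability assigned to the gold decision
    # (score for positive labels, 1-score for negative ones), and the negative
    # mean log is therefore a per-type binary cross-entropy.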

    other_U_a, other_LR_b = create_LR_para(rng, LR_input_size, 16)
    other_LR_para = [other_U_a, other_LR_b]
    other_layer_LR = LogisticRegression(rng,
                                        input=LR_input,
                                        n_in=LR_input_size,
                                        n_out=16,
                                        W=other_U_a,
                                        b=other_LR_b)
    other_prob_matrix = T.nnet.softmax(
        other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    other_prob_tensor3 = other_prob_matrix.reshape((batch_size, 4, 4))
    other_prob = other_prob_tensor3[T.repeat(T.arange(batch_size), 4),
                                    T.tile(T.arange(4), (batch_size)),
                                    other_labels.flatten()]
    other_field_loss = -T.mean(T.log(other_prob))
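    # other_labels encodes 4 categorical fields with 4 classes each; reshaping the
    # 16 outputs to (batch_size*4, 4) applies a softmax per field, and the fancy
    # indexing [repeat(arange(batch),4), tile(arange(4),batch), gold_labels] reads
    # the probability of the gold class for every (example, field) pair, giving a
    # standard cross-entropy over these extra fields.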
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    LR_att_input = T.concatenate([gru_sent_embeddings, bow_emb], axis=1)
    LR_att_input_size = hidden_size[0] + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_att_a, LR_att_b = create_LR_para(rng, LR_att_input_size, 12)
    LR_att_para = [U_att_a, LR_att_b]
    layer_att_LR = LogisticRegression(
        rng,
        input=LR_att_input,
        n_in=LR_att_input_size,
        n_out=12,
        W=U_att_a,
        b=LR_att_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    att_score_matrix = T.nnet.sigmoid(layer_att_LR.before_softmax)  #batch * 12
    att_prob_pos = T.where(labels < 1, 1.0 - att_score_matrix,
                           att_score_matrix)
    att_loss = -T.mean(T.log(att_prob_pos))

    att_other_U_a, att_other_LR_b = create_LR_para(rng, LR_att_input_size, 16)
    att_other_LR_para = [att_other_U_a, att_other_LR_b]
    att_other_layer_att_LR = LogisticRegression(rng,
                                                input=LR_att_input,
                                                n_in=LR_att_input_size,
                                                n_out=16,
                                                W=att_other_U_a,
                                                b=att_other_LR_b)
    att_other_prob_matrix = T.nnet.softmax(
        att_other_layer_att_LR.before_softmax.reshape((batch_size * 4, 4)))
    att_other_prob_tensor3 = att_other_prob_matrix.reshape((batch_size, 4, 4))
    att_other_prob = att_other_prob_tensor3[T.repeat(T.arange(batch_size), 4),
                                            T.tile(T.arange(4), (batch_size)),
                                            other_labels.flatten()]
    att_other_field_loss = -T.mean(T.log(att_other_prob))
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    acnn_LR_input = T.concatenate(
        [sent_att_embeddings, sent_att_embeddings2, bow_emb], axis=1)
    acnn_LR_input_size = hidden_size[0] * 2 + emb_size
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:,
                                                    -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)
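    # "fine-grained" dataless score: assuming normalize_tensor3_colwise L2-normalizes
    # each word vector, the batched dot gives the cosine between every sentence word
    # and every description word; per (sentence, type) pair the 30 largest cosines
    # are averaged and squashed by a sigmoid into a soft lexical-match score.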

    params = multiCNN_para + LR_para + GRU_NN_para + LR_att_para + ACNN_para + acnn_LR_para  # put all model parameters together
    cost = loss + att_loss + acnn_loss + 1e-4 * ((conv_W**2).sum() +
                                                 (conv_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + other_LR_para + att_other_LR_para + acnn_other_LR_para
    cost_other = cost + other_field_loss + att_other_field_loss + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = T.max(T.concatenate([
        att_score_matrix.dimshuffle('x', 0, 1),
        score_matrix.dimshuffle('x', 0, 1),
        acnn_score_matrix.dimshuffle('x', 0, 1)
    ],
                                             axis=0),
                               axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = 0.6 * ensemble_NN_scores + 0.4 * 0.5 * (
        cosine_score_matrix + top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
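    # final test-time prediction: a weighted sum of the neural scores (element-wise
    # max over the CNN, GRU and attentive-CNN sigmoids) and the two dataless cosine
    # scores, binarized with a fixed 0.3 threshold into the multi-label 0/1 output.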

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    other_model = theano.function(
        [sents_id_matrix, sents_mask, labels, other_labels],
        cost_other,
        updates=other_updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        binarize_prob,
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size / batch_size
    dev_batch_start = list(
        np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    dev_batch_start_set = set(dev_batch_start)
    # max_acc_dev=0.0
    max_meanf1_test = 0.0
    max_weightf1_test = 0.0
    train_indices = range(train_size)
    dev_indices = range(dev_size)
    cost_i = 0.0
    other_cost_i = 0.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(dev_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_model(train_sents[train_id_batch],
                                  train_masks[train_id_batch],
                                  train_labels[train_id_batch], label_sent,
                                  label_mask)

            if batch_id in dev_batch_start_set:
                dev_id_batch = dev_indices[batch_id:batch_id + batch_size]
                other_cost_i += other_model(dev_sents[dev_id_batch],
                                            dev_masks[dev_id_batch],
                                            dev_labels[dev_id_batch],
                                            dec_other_labels[dev_id_batch])

            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                error_sum = 0.0
                all_pred_labels = []
                all_gold_labels = []
                for test_batch_id in test_batch_start:  # for each test batch
                    pred_labels = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    gold_labels = test_labels[test_batch_id:test_batch_id +
                                              batch_size]
                    # print 'pred_labels:', pred_labels
                    # print 'gold_labels;', gold_labels
                    all_pred_labels.append(pred_labels)
                    all_gold_labels.append(gold_labels)
                all_pred_labels = np.concatenate(all_pred_labels)
                all_gold_labels = np.concatenate(all_gold_labels)

                test_mean_f1, test_weight_f1 = average_f1_two_array_by_col(
                    all_pred_labels, all_gold_labels)
                if test_weight_f1 > max_weightf1_test:
                    max_weightf1_test = test_weight_f1
                if test_mean_f1 > max_meanf1_test:
                    max_meanf1_test = test_mean_f1
                print '\t\t\t\t\t\t\t\tcurrent f1s:', test_mean_f1, test_weight_f1, '\t\tmax_f1:', max_meanf1_test, max_weightf1_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
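
A small NumPy sketch of the score ensembling and thresholding at the end of the listing above; the scores are invented and only three of the 12 types are shown for brevity:

import numpy as np

# scores for one example over three types (shortened from 12)
nn_scores     = np.array([[0.8, 0.1, 0.6]])  # element-wise max over the three classifiers
cosine_scores = np.array([[0.7, 0.2, 0.4]])  # bag-of-words dataless cosine
topk_scores   = np.array([[0.9, 0.3, 0.5]])  # top-k fine-grained cosine

ensemble_scores = 0.6 * nn_scores + 0.4 * 0.5 * (cosine_scores + topk_scores)
binarize_prob = (ensemble_scores > 0.3).astype(int)  # multi-label 0/1 prediction
print("ensemble scores: %s" % ensemble_scores)
print("binarized labels: %s" % binarize_prob)
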
Code example #12
0
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[50, 50],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    maxDocLength=60,
                    emb_size=50,
                    hidden_size=200,
                    L2_weight=0.0065,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_s_length=57,
                    max_d_length=59,
                    margin=1.0,
                    decay=0.95):
    maxSentLength = max_s_length + 2 * (window_width - 1)
    maxDocLength = max_d_length + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data, train_size, test_data, test_size, vocab_size = load_MCTest_corpus_DQAAAA(
        rootPath + 'vocab_DQAAAA.txt',
        rootPath + 'mc500.train.tsv_standardlized.txt_DQAAAA.txt',
        rootPath + 'mc500.test.tsv_standardlized.txt_DQAAAA.txt', max_s_length,
        maxSentLength, maxDocLength)  #vocab_size contain train, dev and test

    [
        train_data_D, train_data_Q, train_data_A1, train_data_A2,
        train_data_A3, train_data_A4, train_Label, train_Length_D,
        train_Length_D_s, train_Length_Q, train_Length_A1, train_Length_A2,
        train_Length_A3, train_Length_A4, train_leftPad_D, train_leftPad_D_s,
        train_leftPad_Q, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
        train_leftPad_A4, train_rightPad_D, train_rightPad_D_s,
        train_rightPad_Q, train_rightPad_A1, train_rightPad_A2,
        train_rightPad_A3, train_rightPad_A4
    ] = train_data
    [
        test_data_D, test_data_Q, test_data_A1, test_data_A2, test_data_A3,
        test_data_A4, test_Label, test_Length_D, test_Length_D_s,
        test_Length_Q, test_Length_A1, test_Length_A2, test_Length_A3,
        test_Length_A4, test_leftPad_D, test_leftPad_D_s, test_leftPad_Q,
        test_leftPad_A1, test_leftPad_A2, test_leftPad_A3, test_leftPad_A4,
        test_rightPad_D, test_rightPad_D_s, test_rightPad_Q, test_rightPad_A1,
        test_rightPad_A2, test_rightPad_A3, test_rightPad_A4
    ] = test_data

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    #     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    #     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    #     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    #     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    #     indices_train_l=T.cast(indices_train_l, 'int64')
    #     indices_train_r=T.cast(indices_train_r, 'int64')
    #     indices_test_l=T.cast(indices_test_l, 'int64')
    #     indices_test_r=T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(
        rand_values, rootPath + 'vocab_DQAAAA_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()  # now, x is the index matrix, must be integer
    index_Q = T.lvector()
    index_A1 = T.lvector()
    index_A2 = T.lvector()
    index_A3 = T.lvector()
    index_A4 = T.lvector()
    #     y = T.lvector()

    len_D = T.lscalar()
    len_D_s = T.lvector()
    len_Q = T.lscalar()
    len_A1 = T.lscalar()
    len_A2 = T.lscalar()
    len_A3 = T.lscalar()
    len_A4 = T.lscalar()

    left_D = T.lscalar()
    left_D_s = T.lvector()
    left_Q = T.lscalar()
    left_A1 = T.lscalar()
    left_A2 = T.lscalar()
    left_A3 = T.lscalar()
    left_A4 = T.lscalar()

    right_D = T.lscalar()
    right_D_s = T.lvector()
    right_Q = T.lscalar()
    right_A1 = T.lscalar()
    right_A2 = T.lscalar()
    right_A3 = T.lscalar()
    right_A4 = T.lscalar()

    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength)  # doc shape
    filter_words = (emb_size, window_width)
    filter_sents = (nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = debug_print(embeddings[index_D.flatten()].reshape(
        (maxDocLength, maxSentLength, emb_size)).transpose(0, 2, 1),
                                 'layer0_D_input')  #.dimshuffle(0, 'x', 1, 2)
    layer0_Q_input = debug_print(embeddings[index_Q.flatten()].reshape(
        (maxSentLength, emb_size)).transpose(),
                                 'layer0_Q_input')  #.dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = debug_print(embeddings[index_A1.flatten()].reshape(
        (maxSentLength,
         emb_size)).transpose(), 'layer0_A1_input')  #.dimshuffle(0, 'x', 1, 2)
    layer0_A2_input = embeddings[index_A2.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()  #.dimshuffle(0, 'x', 1, 2)
    layer0_A3_input = embeddings[index_A3.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()  #.dimshuffle(0, 'x', 1, 2)
    layer0_A4_input = embeddings[index_A4.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()  #.dimshuffle(0, 'x', 1, 2)

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]
    #     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    #     layer2_para=[conv2_W, conv2_b]
    #     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
    #     highW_para=[high_W, high_b]

    #load_model(params)

    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D, :, :],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b)
    layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:, left_Q:-right_Q],
                                word_dim=emb_size,
                                hidden_dim=nkerns[0],
                                U=U,
                                W=W,
                                b=b,
                                bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:, left_A1:-right_A1],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:, left_A2:-right_A2],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:, left_A3:-right_A3],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)
    layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:, left_A4:-right_A4],
                                 word_dim=emb_size,
                                 hidden_dim=nkerns[0],
                                 U=U,
                                 W=W,
                                 b=b,
                                 bptt_truncate=-1)

    layer0_D_output = debug_print(layer0_D.output, 'layer0_D.output')
    layer0_Q_output = debug_print(layer0_Q.output_vector_mean,
                                  'layer0_Q.output')
    layer0_A1_output = debug_print(layer0_A1.output_vector_mean,
                                   'layer0_A1.output')
    layer0_A2_output = debug_print(layer0_A2.output_vector_mean,
                                   'layer0_A2.output')
    layer0_A3_output = debug_print(layer0_A3.output_vector_mean,
                                   'layer0_A3.output')
    layer0_A4_output = debug_print(layer0_A4.output_vector_mean,
                                   'layer0_A4.output')

    #before reasoning, do a GRU for doc: d
    U_d, W_d, b_d = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d_para = [U_d, W_d, b_d]
    layer_D_GRU = GRU_Matrix_Input(X=layer0_D_output,
                                   word_dim=nkerns[0],
                                   hidden_dim=nkerns[0],
                                   U=U_d,
                                   W=W_d,
                                   b=b_d,
                                   bptt_truncate=-1)

    #Reasoning Layer 1
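    # each sentence-level state of the document GRU is concatenated with the repeated question
    # vector, fed through a two-layer MLP, and GRU-max-pooled into an updated question
    # representation (translated_Q1) that drives the second reasoning hop below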
    repeat_Q = debug_print(
        T.repeat(layer0_Q_output.reshape((layer0_Q_output.shape[0], 1)),
                 maxDocLength,
                 axis=1)[:, :layer_D_GRU.output_matrix.shape[1]], 'repeat_Q')
    input_DNN = debug_print(
        T.concatenate([layer_D_GRU.output_matrix, repeat_Q],
                      axis=0).transpose(),
        'input_DNN')  #each row is an example
    output_DNN1 = HiddenLayer(rng,
                              input=input_DNN,
                              n_in=nkerns[0] * 2,
                              n_out=nkerns[0])
    output_DNN2 = HiddenLayer(rng,
                              input=output_DNN1.output,
                              n_in=nkerns[0],
                              n_out=nkerns[0])

    DNN_out = debug_print(output_DNN2.output.transpose(), 'DNN_out')
    U_p, W_p, b_p = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para = [U_p, W_p, b_p]
    pooling = GRU_Matrix_Input(X=DNN_out,
                               word_dim=nkerns[0],
                               hidden_dim=nkerns[0],
                               U=U_p,
                               W=W_p,
                               b=b_p,
                               bptt_truncate=-1)
    translated_Q1 = debug_print(pooling.output_vector_max, 'translated_Q1')

    #before reasoning, do a GRU for doc: d2
    U_d2, W_d2, b_d2 = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_d2_para = [U_d2, W_d2, b_d2]
    layer_D2_GRU = GRU_Matrix_Input(X=layer_D_GRU.output_matrix,
                                    word_dim=nkerns[0],
                                    hidden_dim=nkerns[0],
                                    U=U_d2,
                                    W=W_d2,
                                    b=b_d2,
                                    bptt_truncate=-1)
    #Reasoning Layer 2
    repeat_Q1 = debug_print(
        T.repeat(translated_Q1.reshape((translated_Q1.shape[0], 1)),
                 maxDocLength,
                 axis=1)[:, :layer_D2_GRU.output_matrix.shape[1]], 'repeat_Q1')
    input_DNN2 = debug_print(
        T.concatenate([layer_D2_GRU.output_matrix, repeat_Q1],
                      axis=0).transpose(),
        'input_DNN2')  #each row is an example
    output_DNN3 = HiddenLayer(rng,
                              input=input_DNN2,
                              n_in=nkerns[0] * 2,
                              n_out=nkerns[0])
    output_DNN4 = HiddenLayer(rng,
                              input=output_DNN3.output,
                              n_in=nkerns[0],
                              n_out=nkerns[0])

    DNN_out2 = debug_print(output_DNN4.output.transpose(), 'DNN_out2')
    U_p2, W_p2, b_p2 = create_GRU_para(rng, nkerns[0], nkerns[0])
    layer_pooling_para2 = [U_p2, W_p2, b_p2]
    pooling2 = GRU_Matrix_Input(X=DNN_out2,
                                word_dim=nkerns[0],
                                hidden_dim=nkerns[0],
                                U=U_p2,
                                W=W_p2,
                                b=b_p2,
                                bptt_truncate=-1)
    translated_Q2 = debug_print(pooling2.output_vector_max, 'translated_Q2')

    QA1 = T.concatenate([translated_Q2, layer0_A1_output], axis=0)
    QA2 = T.concatenate([translated_Q2, layer0_A2_output], axis=0)
    QA3 = T.concatenate([translated_Q2, layer0_A3_output], axis=0)
    QA4 = T.concatenate([translated_Q2, layer0_A4_output], axis=0)
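    # each [question; answer] concatenation is scored by one shared hidden layer (W_HL, b_HL,
    # n_out=1), so all four candidate answers are rated on the same scale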

    W_HL, b_HL = create_HiddenLayer_para(rng, n_in=nkerns[0] * 2, n_out=1)
    match_params = [W_HL, b_HL]
    QA1_match = HiddenLayer(rng,
                            input=QA1,
                            n_in=nkerns[0] * 2,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)
    QA2_match = HiddenLayer(rng,
                            input=QA2,
                            n_in=nkerns[0] * 2,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)
    QA3_match = HiddenLayer(rng,
                            input=QA3,
                            n_in=nkerns[0] * 2,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)
    QA4_match = HiddenLayer(rng,
                            input=QA4,
                            n_in=nkerns[0] * 2,
                            n_out=1,
                            W=W_HL,
                            b=b_HL)

    #     simi_overall_level1=debug_print(cosine(translated_Q2, layer0_A1_output), 'simi_overall_level1')
    #     simi_overall_level2=debug_print(cosine(translated_Q2, layer0_A2_output), 'simi_overall_level2')
    #     simi_overall_level3=debug_print(cosine(translated_Q2, layer0_A3_output), 'simi_overall_level3')
    #     simi_overall_level4=debug_print(cosine(translated_Q2, layer0_A4_output), 'simi_overall_level4')

    simi_overall_level1 = debug_print(QA1_match.output[0],
                                      'simi_overall_level1')
    simi_overall_level2 = debug_print(QA2_match.output[0],
                                      'simi_overall_level2')
    simi_overall_level3 = debug_print(QA3_match.output[0],
                                      'simi_overall_level3')
    simi_overall_level4 = debug_print(QA4_match.output[0],
                                      'simi_overall_level4')

    #     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

    #only use overall_simi
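    # hinge ranking loss: the first candidate (A1, treated as the gold answer) must score at
    # least `margin` higher than each of the three distractors; every violation adds to the cost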
    cost = T.maximum(
        0.0, margin + simi_overall_level2 - simi_overall_level1) + T.maximum(
            0.0,
            margin + simi_overall_level3 - simi_overall_level1) + T.maximum(
                0.0, margin + simi_overall_level4 - simi_overall_level1)

    #     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
    posi_simi = simi_overall_level1
    nega_simi = T.max(
        [simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #     #use ensembled simi
    #     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
    #     posi_simi=simi_1
    #     nega_simi=T.max([simi_2, simi_3, simi_4])

    L2_reg = debug_print(
        (U**2).sum() + (W**2).sum() + (U_p**2).sum() + (W_p**2).sum() +
        (U_p2**2).sum() + (W_p2**2).sum() + (U_d**2).sum() + (W_d**2).sum() +
        (U_d2**2).sum() + (W_d2**2).sum() + (output_DNN1.W**2).sum() +
        (output_DNN2.W**2).sum() + (output_DNN3.W**2).sum() +
        (output_DNN4.W**2).sum() + (W_HL**2).sum(), 'L2_reg'
    )  #+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost = debug_print(cost + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index],
        [cost, posi_simi, nega_simi],
        givens={
            index_D: test_data_D[index],  #a matrix
            index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
            index_A4: test_data_A4[index],
            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
            len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
            len_A4: test_Length_A4[index],
            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
            left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
            left_A4: test_leftPad_A4[index],
            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
            right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index],
            right_A4: test_rightPad_A4[index]
        },
        on_unused_input='ignore')

    params = layer0_para + output_DNN1.params + output_DNN2.params + output_DNN3.params + output_DNN4.params + layer_pooling_para + layer_pooling_para2 + match_params + layer_d_para + layer_d2_para

    #     accumulator=[]
    #     for para_i in params:
    #         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
    #         accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    #     updates = []
    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         grad_i=debug_print(grad_i,'grad_i')
    #         acc = decay*acc_i + (1-decay)*T.sqr(grad_i) #rmsprop
    #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-6)))
    #         updates.append((acc_i, acc))

    def AdaDelta_updates(parameters, gradients, rho, eps):
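        # AdaDelta: keep exponential moving averages of squared gradients and of squared
        # parameter updates; each step is scaled by RMS(past updates) / RMS(gradients),
        # so no global learning rate is needed (rho is the decay rate, eps avoids division by zero)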
        # create variables to store intermediate updates
        gradients_sq = [
            theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters
        ]
        deltas_sq = [
            theano.shared(numpy.zeros(p.get_value().shape)) for p in parameters
        ]

        # calculates the new "average" delta for the next iteration
        gradients_sq_new = [
            rho * g_sq + (1 - rho) * (g**2)
            for g_sq, g in zip(gradients_sq, gradients)
        ]

        # calculates the step in direction. The square root is an approximation to getting the RMS for the average value
        deltas = [
            (T.sqrt(d_sq + eps) / T.sqrt(g_sq + eps)) * grad
            for d_sq, g_sq, grad in zip(deltas_sq, gradients_sq_new, gradients)
        ]

        # calculates the new "average" deltas for the next step.
        deltas_sq_new = [
            rho * d_sq + (1 - rho) * (d**2)
            for d_sq, d in zip(deltas_sq, deltas)
        ]

        # Prepare it as a list f
        gradient_sq_updates = zip(gradients_sq, gradients_sq_new)
        deltas_sq_updates = zip(deltas_sq, deltas_sq_new)
        parameters_updates = [(p, p - d) for p, d in zip(parameters, deltas)]
        return gradient_sq_updates + deltas_sq_updates + parameters_updates

    updates = AdaDelta_updates(params, grads, decay, 1e-6)

    train_model = theano.function(
        [index], [cost, posi_simi, nega_simi],
        updates=updates,
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost, posi_simi, nega_simi],
        givens={
            index_D: train_data_D[index],
            index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
            index_A4: train_data_A4[index],
            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
            len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
            len_A4: train_Length_A4[index],
            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
            left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
            left_A4: train_leftPad_A4[index],
            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
            right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index],
            right_A4: train_rightPad_A4[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #         shuffle(train_batch_start)#shuffle training data

        corr_train = 0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run so far (across epochs)
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1
            sys.stdout.write("Training :[%6f] %% complete!\r" %
                             ((iter % train_size) * 100.0 / train_size))
            sys.stdout.flush()

            minibatch_index = minibatch_index + 1

            cost_average, posi_simi, nega_simi = train_model(batch_start)
            if posi_simi > nega_simi:
                corr_train += 1

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' corr rate: ' + str(
                            corr_train * 100.0 / train_size)

            if iter % validation_frequency == 0:
                corr_test = 0
                for i in test_batch_start:
                    cost, posi_simi, nega_simi = test_model(i)
                    if posi_simi > nega_simi:
                        corr_test += 1

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc = corr_test * 1.0 / test_size
                #test_acc=1-test_score
                print(
                    ('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') %
                    (epoch, minibatch_index, n_train_batches, test_acc * 100.))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                find_better = False
                if test_acc > max_acc:
                    max_acc = test_acc
                    best_epoch = epoch
                    find_better = True
                print '\t\t\ttest_acc:', test_acc, 'max:', max_acc, '(at', best_epoch, ')'
                if find_better == True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
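
Note: the training objective in this example is a pairwise hinge ranking loss over the three wrong answers. A minimal, self-contained illustration of that cost (plain Python, not taken from the code above):

def ranking_cost(score_correct, scores_wrong, margin=1.0):
    # each wrong answer should score at least `margin` below the correct one;
    # only violations contribute to the cost
    return sum(max(0.0, margin + s - score_correct) for s in scores_wrong)

# e.g. ranking_cost(0.8, [0.5, 0.9, 0.2]) adds 0.7 + 1.1 + 0.4 = 2.2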
Code Example #13
0
def evaluate_lenet5(learning_rate=0.5,
                    n_epochs=2000,
                    batch_size=500,
                    emb_size=300,
                    hidden_size=300,
                    L2_weight=0.0001,
                    para_len_limit=700,
                    q_len_limit=40):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    id2word = {y: x for x, y in overall_word2id.iteritems()}
    word2vec = load_word2vec()

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    #
    # #     BdGRU(rng, str(0), shape, X, mask, is_train = 1, batch_size = 1, p = 0.5)
    #
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]
    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)
    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]
    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    questions_reps = questions_model.output_sent_rep_maxpooling.reshape(
        (batch_size, 1, hidden_size))  #(batch, 1, hidden_size)
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    #attention distributions
    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features

    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]

    transformed_para_reps = T.tanh(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2))
    transformed_q_reps = T.tanh(T.dot(questions_reps, norm_W_a1))
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = 0.5 * (transformed_para_reps + transformed_q_reps)
    prior_att = T.concatenate([add_both, normalize_matrix(extraF)], axis=2)

    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices = para_mask.flatten().nonzero()[0]
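    # per-token classification: for every paragraph position the model predicts whether the word
    # lies inside the answer span, using the question/paragraph mixture plus the 3 extra features;
    # padded positions are excluded from the loss through valid_indices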

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.mean(
        T.log(layer3.p_y_given_x)
        [valid_indices,
         labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()


    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    #     def AttentionLayer(q_rep, ext_M):
    #         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    #         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    #         prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    #         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    #         return strength.transpose() #(1, #words)

    #     distributions, updates = theano.scan(
    #     AttentionLayer,
    #     sequences=[questions_reps,extraF] )

    #     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    #     labels=debug_print(labels, 'labels')
    #     label_mask=T.gt(labels,0.0)
    #     neg_label_mask=T.lt(labels,0.0)
    #     dis_masked=distributions*label_mask
    #     remain_dis_masked=distributions*neg_label_mask
    #     pos_error=((dis_masked-1)**2).mean()
    #     neg_error=((remain_dis_masked-(-1))**2).mean()
    #     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras
    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+L2_weight*L2_reg

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
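    # AdaGrad: accumulate squared gradients per parameter and divide the learning rate by the
    # square root of the accumulator, so frequently-updated weights take smaller steps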

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        error,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]
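    # the appended start index (size - batch_size) lets the last, possibly overlapping batch cover
    # the tail of the data while keeping every batch exactly batch_size wide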

    max_exact_acc = 0.0
    cost_i = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #shuffle(train_batch_start)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter counts how many minibatches have been run so far (across epochs)
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            #             haha=para_mask[para_id:para_id+batch_size]
            #             print haha
            #             for i in range(batch_size):
            #                 print len(haha[i])
            cost_i += train_model(
                np.asarray(train_para_list[para_id:para_id + batch_size],
                           dtype='int32'),
                np.asarray(train_Q_list[para_id:para_id + batch_size],
                           dtype='int32'),
                np.asarray(train_label_list[para_id:para_id + batch_size],
                           dtype='int32'),
                np.asarray(train_para_mask[para_id:para_id + batch_size],
                           dtype=theano.config.floatX),
                np.asarray(train_mask[para_id:para_id + batch_size],
                           dtype=theano.config.floatX),
                np.asarray(train_feature_matrixlist[para_id:para_id +
                                                    batch_size],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    #                     print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount += batch_size
                    #                     print q_size
                    #                     print test_para_word_list
                    for q in range(batch_size):  #for each question
                        #                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        #                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        #                         else:
                        #                             ss=len(distribution_matrix[q])
                        #                             combine_list=[]
                        #                             for ii in range(ss):
                        #                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        #                             print combine_list
                        #                         exit(0)
                        #                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q], distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q])
                        q_gold_ans_set = para_gold_ansset_list[q]

                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        exact_match += F1
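                        # note: despite its name, exact_match accumulates per-question macro-F1,
                        # so exact_acc below is the average F1 reported by the print statement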


#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                exact_acc = exact_match / q_amount
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                print 'current average F1:', exact_acc, '\t\tmax F1:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
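
Note: the parameter updates in this example implement AdaGrad. A self-contained numpy sketch of a single AdaGrad step, using the same learning rate and epsilon as above:

import numpy as np

def adagrad_step(param, grad, acc, learning_rate=0.5, eps=1e-8):
    # accumulate squared gradients, then scale each coordinate's step by 1 / sqrt(accumulator)
    acc = acc + grad ** 2
    param = param - learning_rate * grad / (np.sqrt(acc) + eps)
    return param, acc

w, acc = np.zeros(3), np.zeros(3)
w, acc = adagrad_step(w, np.array([0.1, -0.2, 0.0]), acc)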
Code Example #14
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=10000, emb_size=50,
                    margin=0.3, L2_weight=1e-10, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=16450007, neg_size=60, test_neg_size=300,
                    comment=''):#L1Distance_
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/freebase-subsets/'
    rng = numpy.random.RandomState(1234)
#     triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test
    triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,statistics=load_Train(triple_path+'freebase-FB5M2M-combined.txt', line_no, triple_path)
    train_h2t=statistics[0]
    train_t2h=statistics[1]
    train_r2t=statistics[2]
    train_r2h=statistics[3]
    train_r_replace_tail_prop=statistics[4]
    
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)

    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U1, GRU_W1, GRU_b1=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U2, GRU_W2, GRU_b2=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) 
#     para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
#     load_model_from_file(triple_path+'Best_Paras_dim'+str(emb_size), para_to_load)  #+'_hits10_63.616'
#     GRU_U_combine=[GRU_U0, GRU_U1, GRU_U2]
#     GRU_W_combine=[GRU_W0, GRU_W1, GRU_W2]
#     GRU_b_combine=[GRU_b0, GRU_b1, GRU_b2]

#     w2v_entity_rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#    
#     w2v_relation_rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
# 
#     w2v_entity_rand_values=load_word2vec_to_init(w2v_entity_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_entityEmb50.txt')
#     w2v_relation_rand_values=load_word2vec_to_init(w2v_relation_rand_values, triple_path+'freebase_mtr100_mte100-train.txt_ids_relationEmb50.txt')
#     w2v_entity_rand_values=theano.shared(value=w2v_entity_rand_values, borrow=True)       
#     w2v_relation_rand_values=theano.shared(value=w2v_relation_rand_values, borrow=True)  
      
#     entity_E_ensemble=entity_E+norm_matrix(w2v_entity_rand_values)
#     relation_E_ensemble=relation_E+norm_matrix(w2v_relation_rand_values)
    
    norm_entity_E=norm_matrix(entity_E)
    norm_relation_E=norm_matrix(relation_E)
    
        
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)

#     batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
#     batch_start=T.cast(batch_start, 'int64')   
    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    n_index_T = T.ltensor3('n_index_T')
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    
    dist_tail=one_batch_parallel_Ramesh(x_index_l, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size)
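    # ranking objective over corrupted-tail negatives: each positive triple's value (dist_tail)
    # is pushed at least `margin` below the values of its sampled negatives (loss__tail_is)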
    
    
    loss__tail_is=one_neg_batches_parallel_Ramesh(n_index_T, norm_entity_E, norm_relation_E, GRU_U, GRU_W, GRU_b, emb_size)
    loss_tail_i=T.maximum(0.0, margin+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) 
#     loss_relation_i=T.maximum(0.0, margin+dist_relation.reshape((dist_relation.shape[0],1))-loss_relation_is)     
#     loss_head_i=T.maximum(0.0, margin+dist_head.reshape((dist_head.shape[0],1))-loss_head_is)    
    
    
#     loss_tail_i_test=T.maximum(0.0, 0.0+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is)   
#     binary_matrix_test=T.gt(loss_tail_i_test, 0)
#     sum_vector_test=T.sum(binary_matrix_test, axis=1)
#     binary_vector_hits10=T.gt(sum_vector_test, 10)
#     test_loss=T.sum(binary_vector_hits10)*1.0/batch_size  
#     loss_relation_i=T.maximum(0.0, margin+dis_relation.reshape((dis_relation.shape[0],1))-loss__relation_is) 
#     loss_head_i=T.maximum(0.0, margin+dis_head.reshape((dis_head.shape[0],1))-loss__head_is)     
#     def neg_slice(neg_matrix):
#         dist_tail_slice, dis_relation_slice, dis_head_slice=one_batch_parallel_Ramesh(neg_matrix, entity_E, relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size)
#         loss_tail_i=T.maximum(0.0, margin+dist_tail-dist_tail_slice) 
#         loss_relation_i=T.maximum(0.0, margin+dis_relation-dis_relation_slice) 
#         loss_head_i=T.maximum(0.0, margin+dis_head-dis_head_slice) 
#         return loss_tail_i, loss_relation_i, loss_head_i
#     
#     (loss__tail_is, loss__relation_is, loss__head_is), updates = theano.scan(
#        neg_slice,
#        sequences=n_index_T,
#        outputs_info=None)  
    
    loss_tails=T.mean(T.sum(loss_tail_i, axis=1) )
#     loss_relations=T.mean(T.sum(loss_relation_i, axis=1) )
#     loss_heads=T.mean(T.sum(loss_head_i, axis=1) )
    loss=loss_tails#+loss_relations+loss_heads
    L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\
                      +(GRU_U** 2).sum()+(GRU_W** 2).sum(), 'L2_reg')
#     Div_loss=Diversify_Reg(GRU_U[0])+Diversify_Reg(GRU_U[1])+Diversify_Reg(GRU_U[2])+\
#         Diversify_Reg(GRU_W[0])+Diversify_Reg(GRU_W[1])+Diversify_Reg(GRU_W[2])
    cost=loss+L2_weight*L2_loss#+div_reg*Div_loss
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [entity_E, relation_E, GRU_U, GRU_W, GRU_b]
#     params_conv = [conv_W, conv_b]
    params_to_store=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-9)))   #AdaGrad
        updates.append((acc_i, acc))    

   

#     grads = T.grad(cost, params)
#     updates = []
#     for param_i, grad_i in zip(params, grads):
#         updates.append((param_i, param_i - learning_rate * grad_i))   #AdaGrad 
        
    train_model = theano.function([x_index_l, n_index_T], [loss, cost], updates=updates,on_unused_input='ignore')
#     test_model = theano.function([x_index_l, n_index_T], test_loss, on_unused_input='ignore')
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0
#     corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set
    best_train_loss=1000000
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
#         learning_rate/=epoch
#         print 'lr:', learning_rate
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        loss_sum=0.0
        for start in batch_start:
            if start%100000==0:
                print start, '...'
            pos_triples=triples[start:start+batch_size]
            all_negs=[]
#             count=0
            for pos_triple in pos_triples:
                neg_triples=get_n_neg_triples_train(pos_triple, train_triples_set, train_entity_set, train_r_replace_tail_prop, neg_size)
# #                 print 'neg_head_triples'
#                 neg_relation_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 1, neg_size/3)
# #                 print 'neg_relation_triples'
#                 neg_tail_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 2, neg_size/3)
#                 print 'neg_tail_triples'
                all_negs.append(neg_triples)
#                 print 'neg..', count
#                 count+=1
            neg_tensor=numpy.asarray(all_negs).reshape((batch_size, neg_size, 3)).transpose(1,0,2)
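            # all_negs is (batch_size, neg_size, 3); the transpose yields (neg_size, batch_size, 3),
            # i.e. one complete batch of corrupted triples per slice along the first axis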
            loss, cost= train_model(pos_triples, neg_tensor)
            loss_sum+=loss
        loss_sum/=len(batch_start)
        print 'Training loss:', loss_sum, 'cost:', cost
        
        
#         loss_test=0.0
# 
#         for test_start in batch_start_test:
#             pos_triples=test_triples[test_start:test_start+batch_size]
#             all_negs=[]
#             for pos_triple in pos_triples:
#                 neg_triples=get_n_neg_triples_new(pos_triple, corpus_triples_set, test_entity_set, test_relation_set, test_neg_size/2, True)
#                 all_negs.append(neg_triples)
#                 
#             neg_tensor=numpy.asarray(all_negs).reshape((batch_size, test_neg_size, 3)).transpose(1,0,2)
#             loss_test+= test_model(pos_triples, neg_tensor)
#             
#             
#         loss_test/=n_batchs_test
#         print '\t\t\tUpdating epoch', epoch, 'finished! Test hits10:', 1.0-loss_test
        if loss_sum< best_train_loss:
            store_model_to_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), params_to_store)
#             store_model_to_file(triple_path+'Divreg_Best_Paras_dim'+str(emb_size), params_to_store)
            best_train_loss=loss_sum
            print 'Finished storing best  params'
#             exit(0)
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
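
Note: this example builds its training signal by sampling negative triples for each positive one (get_n_neg_triples_train, not shown here, which also receives per-relation statistics). A toy sampler that only corrupts the tail entity, given as a hypothetical illustration rather than the author's helper:

import random

def sample_corrupted_tails(pos_triple, known_triples, all_entities, n_neg):
    # replace the tail entity with random entities, skipping corruptions that are real triples
    h, r, t = pos_triple
    negs = []
    while len(negs) < n_neg:
        t_new = random.choice(all_entities)
        if t_new != t and (h, r, t_new) not in known_triples:
            negs.append((h, r, t_new))
    return negs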
Code Example #15
0
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, nkerns=[90,90], batch_size=1, window_width=2,
                    maxSentLength=64, maxDocLength=60, emb_size=50, hidden_size=200,
                    L2_weight=0.0065, update_freq=1, norm_threshold=5.0, max_s_length=57, max_d_length=59, margin=1.0):
    maxSentLength=max_s_length+2*(window_width-1)
    maxDocLength=max_d_length+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/MCTest/'
    rng = numpy.random.RandomState(23455)
    train_data,train_size, test_data, test_size, vocab_size=load_MCTest_corpus_DPNQ(rootPath+'vocab_DPNQ.txt', rootPath+'mc500.train.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', rootPath+'mc500.test.tsv_standardlized.txt_with_state.txt_DSSSS.txt_DPN.txt_DPNQ.txt', max_s_length,maxSentLength, maxDocLength)#vocab_size contain train, dev and test

    #datasets_nonoverlap, vocab_size_nonoverlap=load_SICK_corpus(rootPath+'vocab_nonoverlap_train_plus_dev.txt', rootPath+'train_plus_dev_removed_overlap_as_training.txt', rootPath+'test_removed_overlap_as_training.txt', max_truncate_nonoverlap,maxSentLength_nonoverlap, entailment=True)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    #mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
#     extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
#     discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

# results=[numpy.array(data_D), numpy.array(data_Q), numpy.array(data_A1), numpy.array(data_A2), numpy.array(data_A3), numpy.array(data_A4), numpy.array(Label),
#          numpy.array(Length_D),numpy.array(Length_D_s), numpy.array(Length_Q), numpy.array(Length_A1), numpy.array(Length_A2), numpy.array(Length_A3), numpy.array(Length_A4),
#         numpy.array(leftPad_D),numpy.array(leftPad_D_s), numpy.array(leftPad_Q), numpy.array(leftPad_A1), numpy.array(leftPad_A2), numpy.array(leftPad_A3), numpy.array(leftPad_A4),
#         numpy.array(rightPad_D),numpy.array(rightPad_D_s), numpy.array(rightPad_Q), numpy.array(rightPad_A1), numpy.array(rightPad_A2), numpy.array(rightPad_A3), numpy.array(rightPad_A4)]
# return results, line_control
    [train_data_D, train_data_A1, train_data_A2, train_data_A3, train_Label,
                 train_Length_D,train_Length_D_s, train_Length_A1, train_Length_A2, train_Length_A3,
                train_leftPad_D,train_leftPad_D_s, train_leftPad_A1, train_leftPad_A2, train_leftPad_A3,
                train_rightPad_D,train_rightPad_D_s, train_rightPad_A1, train_rightPad_A2, train_rightPad_A3]=train_data
    [test_data_D, test_data_A1, test_data_A2, test_data_A3, test_Label,
                 test_Length_D,test_Length_D_s, test_Length_A1, test_Length_A2, test_Length_A3,
                test_leftPad_D,test_leftPad_D_s, test_leftPad_A1, test_leftPad_A2, test_leftPad_A3,
                test_rightPad_D,test_rightPad_D_s, test_rightPad_A1, test_rightPad_A2, test_rightPad_A3]=test_data


    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size

    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)


#     indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
#     indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
#     indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
#     indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
#     indices_train_l=T.cast(indices_train_l, 'int64')
#     indices_train_r=T.cast(indices_train_r, 'int64')
#     indices_test_l=T.cast(indices_test_l, 'int64')
#     indices_test_r=T.cast(indices_test_r, 'int64')



    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_DPNQ_glove_50d.txt')
    #rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum=0

    # allocate symbolic variables for the data
    index = T.lscalar()
    index_D = T.lmatrix()   # index matrix over the document's sentences; must be integer
#     index_Q = T.lvector()
    index_A1= T.lvector()
    index_A2= T.lvector()
    index_A3= T.lvector()
#     index_A4= T.lvector()
#     y = T.lvector()

    len_D=T.lscalar()
    len_D_s=T.lvector()
#     len_Q=T.lscalar()
    len_A1=T.lscalar()
    len_A2=T.lscalar()
    len_A3=T.lscalar()
#     len_A4=T.lscalar()

    left_D=T.lscalar()
    left_D_s=T.lvector()
#     left_Q=T.lscalar()
    left_A1=T.lscalar()
    left_A2=T.lscalar()
    left_A3=T.lscalar()
#     left_A4=T.lscalar()

    right_D=T.lscalar()
    right_D_s=T.lvector()
#     right_Q=T.lscalar()
    right_A1=T.lscalar()
    right_A2=T.lscalar()
    right_A3=T.lscalar()
#     right_A4=T.lscalar()



    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # sentence shape
    dshape = (nkerns[0], maxDocLength) # doc shape
    filter_words=(emb_size,window_width)
    filter_sents=(nkerns[0], window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
#     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the embedded word indices: the document becomes a 3D tensor of shape
    # (maxDocLength, emb_size, maxSentLength), each answer a matrix of shape (emb_size, maxSentLength)
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_D_input = embeddings[index_D.flatten()].reshape((maxDocLength,maxSentLength, emb_size)).transpose(0, 2, 1)#.dimshuffle(0, 'x', 1, 2)
#     layer0_Q_input = embeddings[index_Q.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    layer0_A1_input = embeddings[index_A1.flatten()].reshape((maxSentLength, emb_size)).transpose()
    layer0_A2_input = embeddings[index_A2.flatten()].reshape((maxSentLength, emb_size)).transpose()
    layer0_A3_input = embeddings[index_A3.flatten()].reshape((maxSentLength, emb_size)).transpose()
#     layer0_A4_input = embeddings[index_A4.flatten()].reshape((batch_size,maxSentLength, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)


    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b]
#     conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
#     layer2_para=[conv2_W, conv2_b]
#     high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1])
#     highW_para=[high_W, high_b]

    #load_model(params)


    layer0_D = GRU_Tensor3_Input(T=layer0_D_input[left_D:-right_D,:,:],
                                 lefts=left_D_s[left_D:-right_D],
                                 rights=right_D_s[left_D:-right_D],
                                 hidden_dim=nkerns[0],
                                 U=U,W=W,b=b)
#     layer0_Q = GRU_Matrix_Input(X=layer0_Q_input[:,left_Q:-right_Q], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A1 = GRU_Matrix_Input(X=layer0_A1_input[:,left_A1:-right_A1], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Matrix_Input(X=layer0_A2_input[:,left_A2:-right_A2], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A3 = GRU_Matrix_Input(X=layer0_A3_input[:,left_A3:-right_A3], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
#     layer0_A4 = GRU_Matrix_Input(X=layer0_A4_input[:,left_A4:-right_A4], word_dim=emb_size, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)


    layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
#     layer0_Q_output=debug_print(layer0_Q.output_vector_mean, 'layer0_Q.output')
    layer0_A1_output=debug_print(layer0_A1.output_vector_mean, 'layer0_A1.output')
    layer0_A2_output=debug_print(layer0_A2.output_vector_mean, 'layer0_A2.output')
    layer0_A3_output=debug_print(layer0_A3.output_vector_mean, 'layer0_A3.output')
#     layer0_A4_output=debug_print(layer0_A4.output_vector_mean, 'layer0_A4.output')



#
#
#     conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]))
#     layer0_para=[conv_W, conv_b]
    conv2_W, conv2_b=create_conv_para(rng, filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]))
    layer2_para=[conv2_W, conv2_b]
    high_W, high_b=create_highw_para(rng, nkerns[0], nkerns[1]) # the highway gate requires nkerns[0] and nkerns[1] to have the same dimensionality
    highW_para=[high_W, high_b]
    params = layer2_para+layer0_para+highW_para#+[embeddings]
#     #load_model(params)
#
#     layer0_D = Conv_with_input_para(rng, input=layer0_D_input,
#             image_shape=(maxDocLength, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
# #     layer0_Q = Conv_with_input_para(rng, input=layer0_Q_input,
# #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
# #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_A1 = Conv_with_input_para(rng, input=layer0_A1_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_A2 = Conv_with_input_para(rng, input=layer0_A2_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#     layer0_A3 = Conv_with_input_para(rng, input=layer0_A3_input,
#             image_shape=(batch_size, 1, ishape[0], ishape[1]),
#             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
# #     layer0_A4 = Conv_with_input_para(rng, input=layer0_A4_input,
# #             image_shape=(batch_size, 1, ishape[0], ishape[1]),
# #             filter_shape=(nkerns[0], 1, filter_words[0], filter_words[1]), W=conv_W, b=conv_b)
#
#     layer0_D_output=debug_print(layer0_D.output, 'layer0_D.output')
# #     layer0_Q_output=debug_print(layer0_Q.output, 'layer0_Q.output')
#     layer0_A1_output=debug_print(layer0_A1.output, 'layer0_A1.output')
#     layer0_A2_output=debug_print(layer0_A2.output, 'layer0_A2.output')
#     layer0_A3_output=debug_print(layer0_A3.output, 'layer0_A3.output')
# #     layer0_A4_output=debug_print(layer0_A4.output, 'layer0_A4.output')


#     layer1_DQ=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_Q_output, kern=nkerns[0],
#                                       left_D=left_D, right_D=right_D,
#                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_Q, right_r=right_Q,
#                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_Q+filter_words[1]-1,
#                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
#    def __init__(self, rng, input_D, input_r, kern, left_D, right_D, dim, doc_len, topk): # length_l, length_r: valid lengths after conv
    layer1_DA1=GRU_Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A1_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA2=GRU_Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A2_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
    layer1_DA3=GRU_Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A3_output, kern=nkerns[0],
                                      left_D=left_D, right_D=right_D,
                       dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)
#     layer1_DA4=Average_Pooling_Scan(rng, input_D=layer0_D_output, input_r=layer0_A4_output, kern=nkerns[0],
#                                       left_D=left_D, right_D=right_D,
#                      left_D_s=left_D_s, right_D_s=right_D_s, left_r=left_A4, right_r=right_A4,
#                       length_D_s=len_D_s+filter_words[1]-1, length_r=len_A4+filter_words[1]-1,
#                        dim=maxSentLength+filter_words[1]-1, doc_len=maxDocLength, topk=3)


    #load_model_for_conv2([conv2_W, conv2_b])  # cannot be used here, because nkerns[0] != filter_size[0]
    #conv from sentence to doc
#     layer2_DQ = Conv_with_input_para(rng, input=layer1_DQ.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
#             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA1 = Conv_with_input_para(rng, input=layer1_DA1.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA2 = Conv_with_input_para(rng, input=layer1_DA2.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_DA3 = Conv_with_input_para(rng, input=layer1_DA3.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
            image_shape=(batch_size, 1, nkerns[0], dshape[1]),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_DA4 = Conv_with_input_para(rng, input=layer1_DA4.output_D.reshape((batch_size, 1, nkerns[0], dshape[1])),
#             image_shape=(batch_size, 1, nkerns[0], dshape[1]),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    # convolve the single Q/A sentence representation up to doc level with the same conv weights
#     layer2_Q = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DQ.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
#             image_shape=(batch_size, 1, nkerns[0], 1),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A1 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA1.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A2 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA2.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
    layer2_A3 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA3.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
            image_shape=(batch_size, 1, nkerns[0], 1),
            filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_A4 = Conv_with_input_para_one_col_featuremap(rng, input=layer1_DA4.output_QA_sent_level_rep.reshape((batch_size, 1, nkerns[0], 1)),
#             image_shape=(batch_size, 1, nkerns[0], 1),
#             filter_shape=(nkerns[1], 1, nkerns[0], filter_sents[1]), W=conv2_W, b=conv2_b)
#     layer2_Q_output_sent_rep_Dlevel=debug_print(layer2_Q.output_sent_rep_Dlevel, 'layer2_Q.output_sent_rep_Dlevel')
    layer2_A1_output_sent_rep_Dlevel=debug_print(layer2_A1.output_sent_rep_Dlevel, 'layer2_A1.output_sent_rep_Dlevel')
    layer2_A2_output_sent_rep_Dlevel=debug_print(layer2_A2.output_sent_rep_Dlevel, 'layer2_A2.output_sent_rep_Dlevel')
    layer2_A3_output_sent_rep_Dlevel=debug_print(layer2_A3.output_sent_rep_Dlevel, 'layer2_A3.output_sent_rep_Dlevel')
#     layer2_A4_output_sent_rep_Dlevel=debug_print(layer2_A4.output_sent_rep_Dlevel, 'layer2_A4.output_sent_rep_Dlevel')


#     layer3_DQ=Average_Pooling_for_Top(rng, input_l=layer2_DQ.output, input_r=layer2_Q_output_sent_rep_Dlevel, kern=nkerns[1],
#                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
#                       length_l=len_D+filter_sents[1]-1, length_r=1,
#                        dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA1=Average_Pooling_for_Top(rng, input_l=layer2_DA1.output, input_r=layer2_A1_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0,
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA2=Average_Pooling_for_Top(rng, input_l=layer2_DA2.output, input_r=layer2_A2_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0,
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
    layer3_DA3=Average_Pooling_for_Top(rng, input_l=layer2_DA3.output, input_r=layer2_A3_output_sent_rep_Dlevel, kern=nkerns[1],
                     left_l=left_D, right_l=right_D, left_r=0, right_r=0,
                      length_l=len_D+filter_sents[1]-1, length_r=1,
                       dim=maxDocLength+filter_sents[1]-1, topk=3)
#     layer3_DA4=Average_Pooling_for_Top(rng, input_l=layer2_DA4.output, input_r=layer2_A4_output_sent_rep_Dlevel, kern=nkerns[1],
#                      left_l=left_D, right_l=right_D, left_r=0, right_r=0,
#                       length_l=len_D+filter_sents[1]-1, length_r=1,
#                        dim=maxDocLength+filter_sents[1]-1, topk=3)

    #high-way

#     transform_gate_DQ=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_D_sent_level_rep) + high_b), 'transform_gate_DQ')
    transform_gate_DA1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_D_sent_level_rep) + high_b), 'transform_gate_DA1')
    transform_gate_DA2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_D_sent_level_rep) + high_b), 'transform_gate_DA2')
    transform_gate_DA3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_D_sent_level_rep) + high_b), 'transform_gate_DA3')
#     transform_gate_DA4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_D_sent_level_rep) + high_b), 'transform_gate_DA4')
#     transform_gate_Q=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DQ.output_QA_sent_level_rep) + high_b), 'transform_gate_Q')
    transform_gate_A1=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA1.output_QA_sent_level_rep) + high_b), 'transform_gate_A1')
    transform_gate_A2=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA2.output_QA_sent_level_rep) + high_b), 'transform_gate_A2')
#     transform_gate_A3=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA3.output_QA_sent_level_rep) + high_b), 'transform_gate_A3')
#     transform_gate_A4=debug_print(T.nnet.sigmoid(T.dot(high_W, layer1_DA4.output_QA_sent_level_rep) + high_b), 'transform_gate_A4')


#     overall_D_Q=debug_print((1.0-transform_gate_DQ)*layer1_DQ.output_D_sent_level_rep+transform_gate_DQ*layer3_DQ.output_D_doc_level_rep, 'overall_D_Q')
    overall_D_A1=(1.0-transform_gate_DA1)*layer1_DA1.output_D_sent_level_rep+transform_gate_DA1*layer3_DA1.output_D_doc_level_rep
    overall_D_A2=(1.0-transform_gate_DA2)*layer1_DA2.output_D_sent_level_rep+transform_gate_DA2*layer3_DA2.output_D_doc_level_rep
    overall_D_A3=(1.0-transform_gate_DA3)*layer1_DA3.output_D_sent_level_rep+transform_gate_DA3*layer3_DA3.output_D_doc_level_rep
#     overall_D_A4=(1.0-transform_gate_DA4)*layer1_DA4.output_D_sent_level_rep+transform_gate_DA4*layer3_DA4.output_D_doc_level_rep

#     overall_Q=(1.0-transform_gate_Q)*layer1_DQ.output_QA_sent_level_rep+transform_gate_Q*layer2_Q.output_sent_rep_Dlevel
    overall_A1=(1.0-transform_gate_A1)*layer1_DA1.output_QA_sent_level_rep+transform_gate_A1*layer2_A1.output_sent_rep_Dlevel
    overall_A2=(1.0-transform_gate_A2)*layer1_DA2.output_QA_sent_level_rep+transform_gate_A2*layer2_A2.output_sent_rep_Dlevel
#     overall_A3=(1.0-transform_gate_A3)*layer1_DA3.output_QA_sent_level_rep+transform_gate_A3*layer2_A3.output_sent_rep_Dlevel
#     overall_A4=(1.0-transform_gate_A4)*layer1_DA4.output_QA_sent_level_rep+transform_gate_A4*layer2_A4.output_sent_rep_Dlevel

    simi_sent_level1=debug_print(cosine(layer1_DA1.output_D_sent_level_rep, layer1_DA1.output_QA_sent_level_rep), 'simi_sent_level1')
    simi_sent_level2=debug_print(cosine(layer1_DA2.output_D_sent_level_rep, layer1_DA2.output_QA_sent_level_rep), 'simi_sent_level2')
#     simi_sent_level3=debug_print(cosine(layer1_DA3.output_D_sent_level_rep, layer1_DA3.output_QA_sent_level_rep), 'simi_sent_level3')
#     simi_sent_level4=debug_print(cosine(layer1_DA4.output_D_sent_level_rep, layer1_DA4.output_QA_sent_level_rep), 'simi_sent_level4')


    simi_doc_level1=debug_print(cosine(layer3_DA1.output_D_doc_level_rep, layer2_A1.output_sent_rep_Dlevel), 'simi_doc_level1')
    simi_doc_level2=debug_print(cosine(layer3_DA2.output_D_doc_level_rep, layer2_A2.output_sent_rep_Dlevel), 'simi_doc_level2')
#     simi_doc_level3=debug_print(cosine(layer3_DA3.output_D_doc_level_rep, layer2_A3.output_sent_rep_Dlevel), 'simi_doc_level3')
#     simi_doc_level4=debug_print(cosine(layer3_DA4.output_D_doc_level_rep, layer2_A4.output_sent_rep_Dlevel), 'simi_doc_level4')


    simi_overall_level1=debug_print(cosine(overall_D_A1, overall_A1), 'simi_overall_level1')
    simi_overall_level2=debug_print(cosine(overall_D_A2, overall_A2), 'simi_overall_level2')
#     simi_overall_level3=debug_print(cosine(overall_D_A3, overall_A3), 'simi_overall_level3')
#     simi_overall_level4=debug_print(cosine(overall_D_A4, overall_A4), 'simi_overall_level4')

#     simi_1=simi_overall_level1+simi_sent_level1+simi_doc_level1
#     simi_2=simi_overall_level2+simi_sent_level2+simi_doc_level2

    simi_1=(simi_overall_level1+simi_sent_level1+simi_doc_level1)/3.0
    simi_2=(simi_overall_level2+simi_sent_level2+simi_doc_level2)/3.0
#     simi_3=(simi_overall_level3+simi_sent_level3+simi_doc_level3)/3.0
#     simi_4=(simi_overall_level4+simi_sent_level4+simi_doc_level4)/3.0



#     eucli_1=1.0/(1.0+EUCLID(layer3_DQ.output_D+layer3_DA.output_D, layer3_DQ.output_QA+layer3_DA.output_QA))

#     #only use overall_simi
#     cost=T.maximum(0.0, margin+T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])-simi_overall_level1) # ranking loss: max(0, margin-nega+posi)
#     posi_simi=simi_overall_level1
#     nega_simi=T.max([simi_overall_level2, simi_overall_level3, simi_overall_level4])
    #use ensembled simi
#     cost=T.maximum(0.0, margin+T.max([simi_2, simi_3, simi_4])-simi_1) # ranking loss: max(0, margin-nega+posi)
#     cost=T.maximum(0.0, margin+simi_2-simi_1)
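    # ranking objectives: the correct answer A1 should beat the distractor A2 by the margin
    # at the sentence, doc and overall levels (cost), and match_cost additionally asks A1's
    # sentence-level rep to be closer than A2's to the document rep pooled against A3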
    simi_PQ=cosine(layer1_DA1.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep)
    simi_NQ=cosine(layer1_DA2.output_QA_sent_level_rep, layer1_DA3.output_D_sent_level_rep)
    #bad matching at overall level
#     simi_PQ=cosine(overall_A1, overall_D_A3)
#     simi_NQ=cosine(overall_A2, overall_D_A3)
    match_cost=T.maximum(0.0, margin+simi_NQ-simi_PQ)
    cost=T.maximum(0.0, margin+simi_sent_level2-simi_sent_level1)+T.maximum(0.0, margin+simi_doc_level2-simi_doc_level1)+T.maximum(0.0, margin+simi_overall_level2-simi_overall_level1)
    cost=cost+match_cost
#     posi_simi=simi_1
#     nega_simi=simi_2



    L2_reg =debug_print((high_W**2).sum()+3*(conv2_W**2).sum()+(U**2).sum()+(W**2).sum(), 'L2_reg')#+(embeddings**2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost=debug_print(cost+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')




    test_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2],
          givens={
            index_D: test_data_D[index], #a matrix
#             index_Q: test_data_Q[index],
            index_A1: test_data_A1[index],
            index_A2: test_data_A2[index],
            index_A3: test_data_A3[index],
#             index_A4: test_data_A4[index],

            len_D: test_Length_D[index],
            len_D_s: test_Length_D_s[index],
#             len_Q: test_Length_Q[index],
            len_A1: test_Length_A1[index],
            len_A2: test_Length_A2[index],
            len_A3: test_Length_A3[index],
#             len_A4: test_Length_A4[index],

            left_D: test_leftPad_D[index],
            left_D_s: test_leftPad_D_s[index],
#             left_Q: test_leftPad_Q[index],
            left_A1: test_leftPad_A1[index],
            left_A2: test_leftPad_A2[index],
            left_A3: test_leftPad_A3[index],
#             left_A4: test_leftPad_A4[index],

            right_D: test_rightPad_D[index],
            right_D_s: test_rightPad_D_s[index],
#             right_Q: test_rightPad_Q[index],
            right_A1: test_rightPad_A1[index],
            right_A2: test_rightPad_A2[index],
            right_A3: test_rightPad_A3[index]
#             right_A4: test_rightPad_A4[index]

            }, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
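    # AdaGrad: accumulate the squared gradient of every parameter and divide each update
    # by the square root of that running sum, shrinking steps for frequently-updated parameters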

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
        updates.append((acc_i, acc))

#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         acc = acc_i + T.sqr(grad_i)
#         if param_i == embeddings:
#             updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc))[0], theano.shared(numpy.zeros(emb_size)))))   #AdaGrad
#         else:
#             updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))

    train_model = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2], updates=updates,
          givens={
            index_D: train_data_D[index],
#             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
#             index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
#             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
#             len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
#             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
#             left_A4: train_leftPad_A4[index],

            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
#             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
#             right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2],
          givens={
            index_D: train_data_D[index],
#             index_Q: train_data_Q[index],
            index_A1: train_data_A1[index],
            index_A2: train_data_A2[index],
            index_A3: train_data_A3[index],
#             index_A4: train_data_A4[index],

            len_D: train_Length_D[index],
            len_D_s: train_Length_D_s[index],
#             len_Q: train_Length_Q[index],
            len_A1: train_Length_A1[index],
            len_A2: train_Length_A2[index],
            len_A3: train_Length_A3[index],
#             len_A4: train_Length_A4[index],

            left_D: train_leftPad_D[index],
            left_D_s: train_leftPad_D_s[index],
#             left_Q: train_leftPad_Q[index],
            left_A1: train_leftPad_A1[index],
            left_A2: train_leftPad_A2[index],
            left_A3: train_leftPad_A3[index],
#             left_A4: train_leftPad_A4[index],

            right_D: train_rightPad_D[index],
            right_D_s: train_rightPad_D_s[index],
#             right_Q: train_rightPad_Q[index],
            right_A1: train_rightPad_A1[index],
            right_A2: train_rightPad_A2[index],
            right_A3: train_rightPad_A3[index]
#             right_A4: train_rightPad_A4[index]
            }, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    max_acc=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        shuffle(train_batch_start)#shuffle training data


        posi_train_sent=[]
        nega_train_sent=[]
        posi_train_doc=[]
        nega_train_doc=[]
        posi_train_overall=[]
        nega_train_overall=[]
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
            sys.stdout.write( "Training :[%6f] %% complete!\r" % ((iter%train_size)*100.0/train_size) )
            sys.stdout.flush()
            minibatch_index=minibatch_index+1

            cost_average, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2= train_model(batch_start)
            posi_train_sent.append(simi_sent_level1)
            nega_train_sent.append(simi_sent_level2)
            posi_train_doc.append(simi_doc_level1)
            nega_train_doc.append(simi_doc_level2)
            posi_train_overall.append(simi_overall_level1)
            nega_train_overall.append(simi_overall_level2)
            if iter % n_train_batches == 0:
                corr_train_sent=compute_corr(posi_train_sent, nega_train_sent)
                corr_train_doc=compute_corr(posi_train_doc, nega_train_doc)
                corr_train_overall=compute_corr(posi_train_overall, nega_train_overall)
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' corr rate: '+str(corr_train_sent*300.0/train_size)+' '+str(corr_train_doc*300.0/train_size)+' '+str(corr_train_overall*300.0/train_size)


            if iter % validation_frequency == 0:
                posi_test_sent=[]
                nega_test_sent=[]
                posi_test_doc=[]
                nega_test_doc=[]
                posi_test_overall=[]
                nega_test_overall=[]
                for i in test_batch_start:
                    cost, simi_sent_level1, simi_sent_level2, simi_doc_level1, simi_doc_level2, simi_overall_level1, simi_overall_level2=test_model(i)
                    posi_test_sent.append(simi_sent_level1)
                    nega_test_sent.append(simi_sent_level2)
                    posi_test_doc.append(simi_doc_level1)
                    nega_test_doc.append(simi_doc_level2)
                    posi_test_overall.append(simi_overall_level1)
                    nega_test_overall.append(simi_overall_level2)
                corr_test_sent=compute_corr(posi_test_sent, nega_test_sent)
                corr_test_doc=compute_corr(posi_test_doc, nega_test_doc)
                corr_test_overall=compute_corr(posi_test_overall, nega_test_overall)

                #write_file.close()
                #test_score = numpy.mean(test_losses)
                test_acc_sent=corr_test_sent*1.0/(test_size/3.0)
                test_acc_doc=corr_test_doc*1.0/(test_size/3.0)
                test_acc_overall=corr_test_overall*1.0/(test_size/3.0)
                #test_acc=1-test_score
#                 print(('\t\t\tepoch %i, minibatch %i/%i, test acc of best '
#                            'model %f %%') %
#                           (epoch, minibatch_index, n_train_batches,test_acc * 100.))
                print '\t\t\tepoch', epoch, ', minibatch', minibatch_index, '/', n_train_batches, 'test acc of best model', test_acc_sent*100,test_acc_doc*100,test_acc_overall*100
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')



                find_better=False
                if test_acc_sent > max_acc:
                    max_acc=test_acc_sent
                    best_epoch=epoch
                    find_better=True
                if test_acc_doc > max_acc:
                    max_acc=test_acc_doc
                    best_epoch=epoch
                    find_better=True
                if test_acc_overall > max_acc:
                    max_acc=test_acc_overall
                    best_epoch=epoch
                    find_better=True
                print '\t\t\tmax:', max_acc, '(at', best_epoch, ')'
                if find_better==True:
                    store_model_to_file(params, best_epoch, max_acc)
                    print 'Finished storing best params'

            if patience <= iter:
                done_looping = True
                break


        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()
        #writefile.close()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
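The transform-gate blocks above blend sentence-level and document-level representations in a highway fashion: a sigmoid gate computed from the sentence-level vector decides how much of the doc-level vector to let through. A standalone Theano sketch of that gating pattern, with randomly initialised placeholder weights rather than the trained high_W / high_b:

import numpy
import theano
import theano.tensor as T

hidden = 4
rng = numpy.random.RandomState(0)
high_W = theano.shared(rng.normal(0, 0.1, (hidden, hidden)).astype(theano.config.floatX))
high_b = theano.shared(numpy.zeros(hidden, dtype=theano.config.floatX))

sent_rep = T.vector('sent_rep')   # sentence-level representation
doc_rep = T.vector('doc_rep')     # doc-level representation of the same dimensionality

gate = T.nnet.sigmoid(T.dot(high_W, sent_rep) + high_b)
blended = (1.0 - gate) * sent_rep + gate * doc_rep
highway = theano.function([sent_rep, doc_rep], blended)

print(highway(numpy.ones(hidden, dtype=theano.config.floatX),
              numpy.zeros(hidden, dtype=theano.config.floatX)))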
Code Example #16
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=10, hidden_size=10,
                    L2_weight=0.0001, margin=0.5,
                    train_size=4000000, test_size=1000, 
                    max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.0):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)
    word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len)



    test_ground_truth,all_candidates_f1,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len)
    
    

    overall_vocab_size=len(word2id)
    print 'vocab size:', overall_vocab_size


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234))
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()

    left=T.imatrix()  #(2*batch, len)
    left_mask=T.fmatrix() #(2*batch, len)
    span=T.imatrix()  #(2*batch, span_len)
    span_mask=T.fmatrix() #(2*batch, span_len)
    right=T.imatrix()  #(2*batch, len)
    right_mask=T.fmatrix() #(2*batch, len)
    q=T.imatrix()  #(2*batch, len_q)
    q_mask=T.fmatrix() #(2*batch, len_q)





    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b]
    
    U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size)
    U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size)
    GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b]
    
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)

    attend_para=[W_a1, W_a2]
    params = [embeddings]+GRU1_para+attend_para+GRU2_para
#     load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params)

    left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context)
    span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span)
    right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context)
    q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q)


    left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    left_reps=left_model.output_tensor #(batch, emb, para_len)

    span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    span_reps=span_model.output_tensor #(batch, emb, para_len)

    right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    right_reps=right_model.output_tensor #(batch, emb, para_len)

    q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    q_reps=q_model.output_tensor #(batch, emb, para_len)

    #interaction
    left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps)
    span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps)
    right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps)
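    # dot-product attention between the question and each context piece (left context, candidate
    # span, right context); judging by the returned names, each side is re-expressed as an
    # attention-weighted combination of the other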

#     q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps)
#     q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps)
#     q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps)

    #combine


    origin_W=normalize_matrix(W_a1)
    attend_W=normalize_matrix(W_a2)

    left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W)
    span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W)
    right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W)
    q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W)

    left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W)
    span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W)
    right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W)

    q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)
    q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)
    q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)


    add_left=left_origin_reps+q_attend_left_reps  #(2*batch, len ,hidden)
    add_span=span_origin_reps+q_attend_span_reps
    add_right=right_origin_reps+q_attend_right_reps

    add_q_by_left=q_origin_reps+left_attend_q_reps
    add_q_by_span=q_origin_reps+span_attend_q_reps
    add_q_by_right=q_origin_reps+right_attend_q_reps

    #second GRU


    add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden)
    question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, add_q_by_right_reps], axis=1)   #(batch, 3*hidden)

    simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat)  #(2*batch)
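    # one cosine score per row: the [left; span; right] paragraph-side concatenation against
    # the question-side concatenation attended by the corresponding pieces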

    pos_simi_vec=simi_list[::2]
    neg_simi_vec=simi_list[1::2]

    raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec)



    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=T.sum(raw_loss)#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))



    train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless


    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size    #batch_size means how many pairs
    remain_train=train_size%batch_size
#     train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always an even number
    if remain_train>0:
        train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 
    else:
        train_batch_start=list(np.arange(n_train_batches)*batch_size)




    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_odd_ids = list(np.arange(train_size)*2)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_odd_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter counts how many minibatches have been run so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            if iter%100==0:
                print 'iter:', iter
            iter_accu+=1
            train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]]
            train_id_list=sum(train_id_list,[])
#             print train_id_list
            cost_i+= train_model(
                                np.asarray([train_lefts[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_spans[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_rights[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_questions[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX))

            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match=0.0
                F1_match=0.0


                for test_pair_id in range(test_size):
                    test_example_lefts=test_lefts[test_pair_id]
                    test_example_lefts_mask=test_lefts_mask[test_pair_id]
                    test_example_spans=test_spans[test_pair_id]
                    test_example_spans_mask=test_spans_mask[test_pair_id]
                    test_example_rights=test_rights[test_pair_id]
                    test_example_rights_mask=test_rights_mask[test_pair_id]
                    test_example_questions=test_questions[test_pair_id]
                    test_example_questions_mask=test_questions_mask[test_pair_id]       
                    test_example_candidates_f1=all_candidates_f1[test_pair_id]
                    
                    
                    
                    test_example_size=len(test_example_lefts)
#                     print 'test_pair_id, test_example_size:', test_pair_id, test_example_size
                    if test_example_size < test_batch_size:
                        # pad by repeating the last candidate so this example fills one full test batch
                        pad_size=test_batch_size-test_example_size
                        test_example_lefts+=test_example_lefts[-1:]*pad_size
                        test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size
                        test_example_spans+=test_example_spans[-1:]*pad_size
                        test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size
                        test_example_rights+=test_example_rights[-1:]*pad_size
                        test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size
                        test_example_questions+=test_example_questions[-1:]*pad_size
                        test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size 
                        test_example_candidates_f1+=test_example_candidates_f1[-1:]*pad_size
                        
                        test_example_size=test_batch_size
                    
                                            
                    n_test_batches=test_example_size/test_batch_size
                    n_test_remain=test_example_size%test_batch_size
                    if n_test_remain > 0:
                        test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size]
                    else:
                        test_batch_start=list(np.arange(n_test_batches)*test_batch_size)
                    all_simi_list=[]
                    all_cand_list=[]
                    for test_para_id in test_batch_start:
                        simi_return_vector=test_model(
                                    np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX))
                        candidate_f1_list=test_example_candidates_f1[test_para_id:test_para_id+test_batch_size]
                        all_simi_list+=list(simi_return_vector)
                        all_cand_list+=candidate_f1_list
                    top1_f1=all_cand_list[np.argsort(all_simi_list)[-1]]
#                     print top1_cand, test_ground_truth[test_pair_id]

                    if top1_f1 == 1.0:
                        exact_match+=1
#                     F1=macrof1(top1_cand, test_ground_truth[test_pair_id])
#                     print '\t\t\t', F1
                    F1_match+=top1_f1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc=F1_match/test_size
                exact_acc=exact_match/test_size
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
#                     store_model_to_file(params, emb_size)
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Para_'+str(max_exact_acc), params)
                        print 'Finished storing best params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
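In the training loop above the minibatch interleaves each positive example with its sampled negative (train_odd_ids actually holds the even indices), so the cosine scores in simi_list alternate positive, negative, positive, negative, and the pairwise hinge loss only needs the even/odd slices. A small NumPy sketch of that loss, assuming simi_list has already been computed:

import numpy as np

def pair_ranking_loss(simi_list, margin=0.5):
    pos = simi_list[::2]    # scores of the positive pairs (even rows)
    neg = simi_list[1::2]   # scores of the paired negatives (odd rows)
    return np.maximum(0.0, margin + neg - pos).sum()

# the second pair violates the margin (0.4 vs 0.6) and contributes 0.7 to the loss
print(pair_ranking_loss(np.array([0.9, 0.2, 0.4, 0.6])))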
Code Example #17
0
File: test_KBE_Ramesh.py  Project: yinwenpeng/KBEmb
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[50], batch_size=1000, window_width=4,
                    maxSentLength=64, emb_size=50, hidden_size=50,
                    margin=0.5, L2_weight=0.0004, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=483142, comment='v5_margin0.6_neg300_'):
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/FB15k/'
    rng = numpy.random.RandomState(1234)
    triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,dev_triples, dev_triples_set, dev_entity_set, dev_relation_set, test_triples, test_triples_set, test_entity_set, test_relation_set=load_TrainDevTest_triples_RankingLoss(triple_path+'freebase_mtr100_mte100-train.txt',triple_path+'freebase_mtr100_mte100-valid.txt', triple_path+'freebase_mtr100_mte100-test.txt', line_no, triple_path)
    
    
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)
    dev_size=len(dev_triples)
    print 'dev triple size:', dev_size, 'entity_size:', len(dev_entity_set)
    test_size=len(test_triples)
    print 'test triple size:', test_size, 'entity_size:', len(test_entity_set)
#     print triples
#     print entity_count
#     print relation_count
#     exit(0)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
#     mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
#     wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    
#     entity_count=theano.shared(numpy.asarray(entity_count, dtype=theano.config.floatX), borrow=True)
#     entity_count=T.cast(entity_count, 'int64')
#     relation_count=theano.shared(numpy.asarray(relation_count, dtype=theano.config.floatX), borrow=True)
#     relation_count=T.cast(relation_count, 'int64')    


    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) 
    
    para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
    load_model_from_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), para_to_load)
    norm_entity_E=norm_matrix(entity_E)
    norm_relation_E=norm_matrix(relation_E)
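    # split the line_no training triples into fixed-size batches; if the last batch would be
    # short, start it at line_no - batch_size so it overlaps the previous one and every batch
    # keeps exactly batch_size triples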
    
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)
    batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
    batch_start=T.cast(batch_start, 'int64')   
    

    test_triple = T.lvector('test_triple')  
    neg_inds = T.lvector('neg_inds')

    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
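    # score a test triple: combine the head-entity and relation embeddings with the GRU to
    # predict a tail embedding, then measure the squared (1 - cosine) distance to the gold
    # tail (pos_loss) and to every negative candidate tail (neg_loss_vector); the Theano
    # function GRU_forward_step compiled below returns both quantities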

    predicted_tail=GRU_Combine_2Vector(norm_entity_E[test_triple[0]], norm_relation_E[test_triple[1]], emb_size, GRU_U, GRU_W, GRU_b)
    golden_tail=norm_entity_E[test_triple[2]]
    pos_loss=(1-cosine(predicted_tail,golden_tail))**2
    neg_Es=norm_entity_E[neg_inds].reshape((neg_inds.shape[0], emb_size))
    predicted_tail=predicted_tail.reshape((1, emb_size))
    multi=T.sum(predicted_tail*neg_Es, axis=1)
    len1=T.sqrt(T.sum(predicted_tail**2))
    len2=T.sqrt(T.sum(neg_Es**2, axis=1))
    cos=multi/(len1*len2)
    neg_loss_vector=(1-cos)**2

#     normed_predicted_tail=predicted_tail/T.sqrt(T.sum(predicted_tail**2))
#     
#     pos_loss=T.sum(abs(normed_predicted_tail-golden_tail))
#     neg_Es=norm_entity_E[neg_inds].reshape((neg_inds.shape[0], emb_size))
#     predicted_tail=normed_predicted_tail.reshape((1, emb_size))
# 
#     neg_loss_vector=T.sum(abs(predicted_tail-neg_Es), axis=1)
   
    
    
    
    GRU_forward_step = theano.function([test_triple, neg_inds], [pos_loss,neg_loss_vector], on_unused_input='ignore')
    
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0
    corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
#         cost_1, cost_l= train_model(triples)
#                 #print 'layer3_input', layer3_input
#         print 'cost:', cost_1, cost_l
        
        #test
        test_size=len(test_triples)
        hits_10=test_size
        hits_1=test_size
        
        co=0
        for test_triple in test_triples:
            co+=1

            count=0
            flag_continue=True
            nega_entity_set=get_negas(test_triple, corpus_triples_set, test_entity_set)
#             print len(nega_entity_set)
            p_loss, n_loss_vector=GRU_forward_step(test_triple, list(nega_entity_set))

            n_loss_vector=numpy.sort(n_loss_vector)
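            # rank the gold tail against the negatives: if its loss exceeds the smallest
            # negative loss it falls outside hits@1; if it exceeds the 10th smallest, it
            # falls outside hits@10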
#             print p_loss
#             print n_loss_vector[:15]
#             exit(0)
            if p_loss>n_loss_vector[0]:
                hits_1-=1
            if p_loss>n_loss_vector[9]:
                hits_10-=1 
            if co%1000==0:
                print co, '...'
                print '\t\thits_10', hits_10*100.0/test_size, 'hits_1', hits_1*100.0/test_size
        hits_10=hits_10*100.0/test_size
        hits_1=hits_1*100.0/test_size
        
        
#             if patience <= iter:
#                 done_looping = True
#                 break
        # store the evaluated parameters (file name tagged with the hits@10 score), report the scores, and stop after this single evaluation pass
        store_model_to_file(triple_path+'Best_Paras_dim'+str(emb_size)+'_hits10_'+str(hits_10)[:6], para_to_load)
        print 'Finished storing best  params'
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min, Hits_10:',  hits_10, 'Hits_1:,', hits_1
        mid_time = time.clock()
        exit(0)
#         exit(0)
        
#         #store the paras after epoch 15
#         if epoch ==22:
#             store_model_to_file(params_conv)
#             print 'Finished storing best conv params'
#             exit(0)
            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #18
0
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[50,50], batch_size=10, window_width=3,
                    maxSentLength=64, emb_size=50, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, update_freq=1, norm_threshold=5.0, max_truncate=33):# max_truncate can be 45
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SICK/';
    rng = numpy.random.RandomState(23455)
#     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size=load_SICK_corpus(rootPath+'vocab.txt', rootPath+'train_plus_dev.txt', rootPath+'test.txt', max_truncate,maxSentLength, entailment=True)
    mt_train, mt_test=load_mts_wikiQA(rootPath+'Train_plus_dev_MT/concate_14mt_train.txt', rootPath+'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test=load_extra_features(rootPath+'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt', rootPath+'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt')
    discri_train, discri_test=load_extra_features(rootPath+'train_plus_dev_discri_features_0.3.txt', rootPath+'test_discri_features_0.3.txt')
    
    
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    
    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad = datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')
#     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_lower_in_word2vec_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    

    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lvector()
    right_l=T.lvector()
    left_r=T.lvector()
    right_r=T.lvector()
    length_l=T.lvector()
    length_r=T.lvector()
    norm_length_l=T.dvector()
    norm_length_r=T.dvector()
    mts=T.dmatrix()
    extra=T.dmatrix()
    discri=T.dmatrix()
    cost_tmp=T.dscalar()




#     #GPU
#     index = T.iscalar()
#     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.imatrix('x_index_r')
#     y = T.ivector('y')  
#     left_l=T.iscalar()
#     right_l=T.iscalar()
#     left_r=T.iscalar()
#     right_r=T.iscalar()
#     length_l=T.iscalar()
#     length_r=T.iscalar()
#     norm_length_l=T.fscalar()
#     norm_length_r=T.fscalar()
#     #mts=T.dmatrix()
#     #wmf=T.dmatrix()
#     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # each sentence is treated as an (emb_size, maxSentLength) 'image'
    filter_size=(emb_size,window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
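    # output length of a wide ('full') convolution: input length + filter width - 1
    # (apparently a leftover from a convolutional variant; the model below uses GRUs)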
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up word embeddings and reshape each side of the pair into a
    # (batch_size, emb_size, maxSentLength) tensor of embedded sentences
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(embeddings[x_index_l.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_l_input')
    layer0_r_input = debug_print(embeddings[x_index_r.flatten()].reshape((batch_size, maxSentLength, emb_size)).transpose(0,2,1), 'layer0_r_input')
    #paras:
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b]     
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para=[U1, W1, b1] 
    def loop (l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i):   
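        # per-example scan body: encode the unpadded left and right sentences with a shared
        # GRU, align them via an attention matrix, re-encode the three least-attended
        # positions of each side with a second GRU, and concatenate the similarity features
        # with the MT, length and extra features into one row of layer3_input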
        l_input_tensor=debug_print(Matrix_Bit_Shift(l_matrix[:,l_left:-l_right]), 'l_input_tensor')
        r_input_tensor=debug_print(Matrix_Bit_Shift(r_matrix[:,r_left:-r_right]), 'r_input_tensor')
        
        addition_l=T.sum(l_matrix[:,l_left:-l_right], axis=1)
        addition_r=T.sum(r_matrix[:,r_left:-r_right], axis=1)
        cosine_addition=cosine(addition_l, addition_r)
        eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
        
        layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
        
        cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
        eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
        
        attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
        
        l_max_attention=T.max(attention_matrix, axis=1)
        neighborsArgSorted = T.argsort(l_max_attention)
        kNeighborsArg = neighborsArgSorted[:3]#only average the min 3 vectors
        ll = T.sort(kNeighborsArg).flatten() # keep the selected indices in ascending order
    
    
        r_max_attention=T.max(attention_matrix, axis=0)
        neighborsArgSorted_r = T.argsort(r_max_attention)
        kNeighborsArg_r = neighborsArgSorted_r[:3]#only average the min 3 vectors
        rr = T.sort(kNeighborsArg_r).flatten() # keep the selected indices in ascending order
    
        
        l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
        r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
        

    
        layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
        layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    
        vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
        vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')
    
        
        
    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #     
        uni_cosine=cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
        eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #     
        len_l=norm_length_l_i.reshape((1,1))
        len_r=norm_length_r_i.reshape((1,1))  
    #     
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))  
    #     '''
        #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
        #length_gap=T.sqrt((len_l-len_r)**2)
        #layer3_input=mts
#         layer3_input_nn=T.concatenate([vec_l, vec_r,
#                                     cosine_addition, eucli_addition,
#     #                                 cosine_sent, eucli_sent,
#                                     uni_cosine,eucli_1], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
        
        output_i=T.concatenate([vec_l, vec_r,
                                    cosine_addition, eucli_addition,
    #                                 cosine_sent, eucli_sent,
                                    uni_cosine,eucli_1,
                                    mts_i.reshape((1,14)),
                                    len_l, len_r,
                                    extra_i.reshape((1,9))], axis=1)#, layer2.output, layer1.output_cosine], axis=1)    
        return output_i
    
    layer3_input, _ = theano.scan(fn=loop,
                            sequences=[left_l, right_l, layer0_l_input, left_r, right_r, layer0_r_input, mts, extra, norm_length_l, norm_length_r],
                            outputs_info=None,#[self.h0, None],
                            n_steps=batch_size)       
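    # theano.scan applies loop() once per example in the minibatch; the stacked rows are
    # reshaped below into the (batch_size, feature_size) input of the logistic layer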
#l_left, l_right, l_matrix, r_left, r_right, r_matrix, mts_i, extra_i, norm_length_l_i, norm_length_r_i
#     x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
#     x_index_r = T.lmatrix('x_index_r')
#     y = T.lvector('y')  
#     left_l=T.lvector()
#     right_l=T.lvector()
#     left_r=T.lvector()
#     right_r=T.lvector()
#     length_l=T.lvector()
#     length_r=T.lvector()
#     norm_length_l=T.dvector()
#     norm_length_r=T.dvector()
#     mts=T.dmatrix()
#     extra=T.dmatrix()
#     discri=T.dmatrix()
#     cost_tmp=T.dscalar()

    
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    feature_size=2*nkerns[1]+2+2+14+2+9
    layer3_input=layer3_input.reshape((batch_size, feature_size))
    layer3=LogisticRegression(rng, input=layer3_input, n_in=feature_size, n_out=3)
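    # n_out=3: presumably the three SICK entailment labels (entailment / neutral / contradiction)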

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.errors(y),layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index: index + batch_size],
            right_l: testRightPad_l[index: index + batch_size],
            left_r: testLeftPad_r[index: index + batch_size],
            right_r: testRightPad_r[index: index + batch_size],
            length_l: testLengths_l[index: index + batch_size],
            length_r: testLengths_r[index: index + batch_size],
            norm_length_l: normalized_test_length_l[index: index + batch_size],
            norm_length_r: normalized_test_length_r[index: index + batch_size],
            mts: mt_test[index: index + batch_size],
            extra: extra_test[index: index + batch_size],
            discri:discri_test[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#       
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
# 
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
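        # a minimal Adam variant (assumed to follow Kingma & Ba, 2014); note that b1 and b2
        # here play the role of (1 - beta1) and (1 - beta2), so the defaults correspond to
        # beta1 = 0.9, beta2 = 0.999, and the bias correction is folded into
        # lr_t = lr * sqrt(1 - beta2^t) / (1 - beta1^t)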
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
    
    updates=Adam(cost=cost, params=params, lr=learning_rate)
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)

    train_model_predict = theano.function([index, cost_tmp], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index: index + batch_size],
            right_l: trainRightPad_l[index: index + batch_size],
            left_r: trainLeftPad_r[index: index + batch_size],
            right_r: trainRightPad_r[index: index + batch_size],
            length_l: trainLengths_l[index: index + batch_size],
            length_r: trainLengths_r[index: index + batch_size],
            norm_length_l: normalized_train_length_l[index: index + batch_size],
            norm_length_r: normalized_train_length_r[index: index + batch_size],
            mts: mt_train[index: index + batch_size],
            extra: extra_train[index: index + batch_size],
            discri:discri_train[index: index + batch_size]
            }, on_unused_input='ignore', allow_input_downcast=True)



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    acc_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
#         shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been run so far, across all epochs
#             if (batch_start+1)%1000==0:
#                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    test_loss, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y)
                    test_features.append(layer3_input)
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+
 
                #write_file.close()
                test_score = numpy.mean(test_losses)
                test_features=numpy.concatenate(test_features, axis=0)
                test_y=numpy.concatenate(test_y, axis=0)
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                           'model %f %%') %
                          (epoch, minibatch_index, n_train_batches,
                           (1-test_score) * 100.))
                acc_nn=1-test_score
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                 
                # this step is costly: if the training set is large, re-extracting features here roughly doubles the training time
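                # re-extract layer3 features for the whole training set and fit external SVM
                # and logistic-regression classifiers on them; the best of the NN, SVM and LR
                # accuracies is tracked in acc_max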
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start, 0.0)
                    train_y.append(y)
                    train_features.append(layer3_input)
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1
 
                train_features=numpy.concatenate(train_features, axis=0)
                train_y=numpy.concatenate(train_y, axis=0)
 
                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results=clf.predict(test_features)
                lr=linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                corr_count=0
                corr_count_lr=0
                test_size=len(test_y)
                for i in range(test_size):
                    if results[i]==test_y[i]:
                        corr_count+=1
                    if results_lr[i]==test_y[i]:
                        corr_count_lr+=1
                acc_svm=corr_count*1.0/test_size
                acc_lr=corr_count_lr*1.0/test_size
                if acc_svm > acc_max:
                    acc_max=acc_svm
                    best_epoch=epoch
                if acc_lr > acc_max:
                    acc_max=acc_lr
                    best_epoch=epoch
                if acc_nn > acc_max:
                    acc_max=acc_nn
                    best_epoch=epoch
                print  'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ',    acc_max , ' at epoch: ', best_epoch  

            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #19
0
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[256,256], batch_size=1, window_width=[4,4],
                    maxSentLength=64, emb_size=300, hidden_size=200,
                    margin=0.5, L2_weight=0.0006, Div_reg=0.06, update_freq=1, norm_threshold=5.0, max_truncate=40):
    maxSentLength=max_truncate+2*(window_width[0]-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/';
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
    wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0]
    indices_train_l=indices_train[::2,:]
    indices_train_r=indices_train[1::2,:]
    trainLengths_l=trainLengths[::2]
    trainLengths_r=trainLengths[1::2]
    normalized_train_length_l=normalized_train_length[::2]
    normalized_train_length_r=normalized_train_length[1::2]

    trainLeftPad_l=trainLeftPad[::2]
    trainLeftPad_r=trainLeftPad[1::2]
    trainRightPad_l=trainRightPad[::2]
    trainRightPad_r=trainRightPad[1::2]    
    indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1]
    indices_test_l=indices_test[::2,:]
    indices_test_r=indices_test[1::2,:]
    testLengths_l=testLengths[::2]
    testLengths_r=testLengths[1::2]
    normalized_test_length_l=normalized_test_length[::2]
    normalized_test_length_r=normalized_test_length[1::2]
    
    testLeftPad_l=testLeftPad[::2]
    testLeftPad_r=testLeftPad[1::2]
    testRightPad_l=testRightPad[::2]
    testRightPad_r=testRightPad[1::2]  

    n_train_batches=indices_train_l.shape[0]/batch_size
    n_test_batches=indices_test_l.shape[0]/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)

    
    indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True)
    indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True)
    indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True)
    indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True)
    indices_train_l=T.cast(indices_train_l, 'int64')
    indices_train_r=T.cast(indices_train_r, 'int64')
    indices_test_l=T.cast(indices_test_l, 'int64')
    indices_test_r=T.cast(indices_test_r, 'int64')
    


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_embs_300d.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      
    
    #cost_tmp=0
    error_sum=0
    
    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')  
    left_l=T.lscalar()
    right_l=T.lscalar()
    left_r=T.lscalar()
    right_r=T.lscalar()
    length_l=T.lscalar()
    length_r=T.lscalar()
    norm_length_l=T.dscalar()
    norm_length_r=T.dscalar()
    mts=T.dmatrix()
    wmf=T.dmatrix()
    cost_tmp=T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # each sentence is treated as an (emb_size, maxSentLength) 'image'
    filter_size=(emb_size,window_width[0])
    filter_size_2=(nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Look up word embeddings and reshape each sentence into an
    # (emb_size, maxSentLength) matrix (batch_size is 1 here)
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape((maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape((maxSentLength, emb_size)).transpose()
    
    
    l_input_tensor=debug_print(Matrix_Bit_Shift(layer0_l_input[:,left_l:-right_l]), 'l_input_tensor')
    r_input_tensor=debug_print(Matrix_Bit_Shift(layer0_r_input[:,left_r:-right_r]), 'r_input_tensor')
    
    addition_l=T.sum(layer0_l_input[:,left_l:-right_l], axis=1)
    addition_r=T.sum(layer0_r_input[:,left_r:-right_r], axis=1)
    cosine_addition=cosine(addition_l, addition_r)
    eucli_addition=1.0/(1.0+EUCLID(addition_l, addition_r))#25.2%
    
    U, W, b=create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para=[U, W, b] 
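    # the same GRU parameters U, W, b encode both sentences (siamese-style weight sharing)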

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor, hidden_dim=nkerns[0],U=U,W=W,b=b,bptt_truncate=-1)
    
    cosine_sent=cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent=1.0/(1.0+EUCLID(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep))#25.2%
    
    #ibm attentive pooling at extended sentence level
    attention_matrix=compute_simi_feature_matrix_with_matrix(layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim, layer0_A2.dim, maxSentLength*(maxSentLength+1)/2)
#     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
#     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
#     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
#     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()    
#     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
#     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%      
    
    
    
    #ibm attentive pooling at original sentence level
    simi_matrix_sent=compute_simi_feature_matrix_with_matrix(layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates, length_l, length_r, maxSentLength)
    attention_vec_l=T.nnet.softmax(T.max(simi_matrix_sent, axis=1)).transpose()
    ibm_l=layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r=T.nnet.softmax(T.max(simi_matrix_sent, axis=0)).transpose()
    ibm_r=layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()    
    cosine_ibm=cosine(ibm_l, ibm_r)
    eucli_ibm=1.0/(1.0+EUCLID(ibm_l, ibm_r))#25.2%    
    
    l_max_attention=T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]#only average the max 3 vectors
    ll = T.sort(kNeighborsArg).flatten() # keep the selected indices in ascending order


    r_max_attention=T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[-3:]#only average the max 3 vectors
    rr = T.sort(kNeighborsArg_r).flatten() # keep the selected indices in ascending order

    
    l_max_min_attention=debug_print(layer0_A1.output_matrix[:,ll], 'l_max_min_attention')
    r_max_min_attention=debug_print(layer0_A2.output_matrix[:,rr], 'r_max_min_attention')
    
    U1, W1, b1=create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para=[U1, W1, b1] 

    layer1_A1=GRU_Matrix_Input(X=l_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)
    layer1_A2=GRU_Matrix_Input(X=r_max_min_attention, word_dim=nkerns[0], hidden_dim=nkerns[1],U=U1,W=W1,b=b1,bptt_truncate=-1)

    vec_l=debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
    vec_r=debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')

    
    
#     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
#     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
#     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
#     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
#     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
#     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
#     
    uni_cosine=cosine(vec_l, vec_r)
#     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
#     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')    
#     '''
#     linear=Linear(sum_uni_l, sum_uni_r)
#     poly=Poly(sum_uni_l, sum_uni_r)
#     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
#     rbf=RBF(sum_uni_l, sum_uni_r)
#     gesd=GESD(sum_uni_l, sum_uni_r)
#     '''
    eucli_1=1.0/(1.0+EUCLID(vec_l, vec_r))#25.2%
#     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
#     
    len_l=norm_length_l.reshape((1,1))
    len_r=norm_length_r.reshape((1,1))  
#     
#     '''
#     len_l=length_l.reshape((1,1))
#     len_r=length_r.reshape((1,1))  
#     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input=T.concatenate([vec_l, vec_r,
                                uni_cosine,eucli_1,
                                cosine_addition, eucli_addition,
#                                 cosine_sent, eucli_sent,

                                ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])), #2*nkerns[0]+
                                cosine_ibm, eucli_ibm,
                                len_l, len_r,wmf
                                ], axis=1)#, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3=LogisticRegression(rng, input=layer3_input, n_in=(2*nkerns[1]+2)+2 +(2*nkerns[0]+2)+2+2, n_out=2)
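    # n_in must equal the width of layer3_input: 2*nkerns[1] (sentence vectors) + 2 (their
    # cosine/euclid) + 2 (bag-of-words cosine/euclid) + 2*nkerns[0] (attentive-pooled
    # vectors) + 2 (their cosine/euclid) + 2 (normalized lengths) + 2 (word-matching
    # features, assuming wmf has two columns)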
    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((layer3.W** 2).sum()+(U** 2).sum()+(W** 2).sum()+(U1** 2).sum()+(W1** 2).sum(), 'L2_reg')#+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(layer3.W.T)+Diversify_Reg(U[0])+Diversify_Reg(W[0])+Diversify_Reg(U1[0])+Diversify_Reg(W1[0])+Diversify_Reg(U[1])+Diversify_Reg(W[1])+Diversify_Reg(U1[1])+Diversify_Reg(W1[1])+Diversify_Reg(U[2])+Diversify_Reg(W[2])+Diversify_Reg(U1[2])+Diversify_Reg(W1[2])
    cost_this =debug_print(layer3.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg
    cost=debug_print((cost_this+cost_tmp)/update_freq+L2_weight*L2_reg+Div_reg*diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [layer3.prop_for_posi,layer3_input, y],
          givens={
            x_index_l: indices_test_l[index: index + batch_size],
            x_index_r: indices_test_r[index: index + batch_size],
            y: testY[index: index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index: index + batch_size],
            wmf: wm_test[index: index + batch_size]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params+ layer1_para+layer0_para#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
#     accumulator=[]
#     for para_i in params:
#         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
#         accumulator.append(theano.shared(eps_p, borrow=True))
#         
#     # create a list of gradients for all model parameters
#     grads = T.grad(cost, params)
#   
#     updates = []
#     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         grad_i=debug_print(grad_i,'grad_i')
#         acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
#         updates.append((acc_i, acc))    

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates
      
    updates=Adam(cost=cost, params=params, lr=learning_rate)  
  
    train_model = theano.function([index,cost_tmp], cost, updates=updates,
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')

    train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
          givens={
            x_index_l: indices_train_l[index: index + batch_size],
            x_index_r: indices_train_r[index: index + batch_size],
            y: trainY[index: index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index: index + batch_size],
            wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp=0.0
        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been run so far, across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
#             print batch_start
            if iter%update_freq != 0:
                cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp+=cost_ij
                error_sum+=error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average= train_model(batch_start,cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum=0
                cost_tmp=0.0#reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+' average cost: '+str(cost_average)+' error: '+str(error_sum)+'/'+str(update_freq)+' error rate: '+str(error_sum*1.0/update_freq)
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
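                # evaluation: collect the positive-class probability for every test pair,
                # compute MAP/MRR against test_filtered.txt, then refit an SVM and a linear
                # regression on the layer3 features as alternative re-rankers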
                test_probs=[]
                test_y=[]
                test_features=[]
                for i in test_batch_start:
                    prob_i, layer3_input, y=test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR=compute_map_mrr(rootPath+'test_filtered.txt', test_probs)
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                           'model %f, MRR  %f') %
                          (epoch, minibatch_index, n_train_batches,MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y=[]
                train_features=[]
                count=0
                for batch_start in train_batch_start: 
                    cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm=clf.decision_function(test_features)
                MAP_svm, MRR_svm=compute_map_mrr(rootPath+'test_filtered.txt', results_svm)
                
                lr=LinearRegression().fit(train_features, train_y)
                results_lr=lr.predict(test_features)
                MAP_lr, MRR_lr=compute_map_mrr(rootPath+'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code example #20
0
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    nkerns=[256, 256],
                    batch_size=1,
                    window_width=3,
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=33):  # max_truncate can be 45
    maxSentLength = max_truncate + 2 * (window_width - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SICK/'
    rng = numpy.random.RandomState(23455)
    #     datasets, vocab_size=load_SICK_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'train.txt', rootPath+'test.txt', max_truncate,maxSentLength)#vocab_size contain train, dev and test
    datasets, vocab_size = load_SICK_corpus(rootPath + 'vocab.txt',
                                            rootPath + 'train_plus_dev.txt',
                                            rootPath + 'test.txt',
                                            max_truncate,
                                            maxSentLength,
                                            entailment=True)
    mt_train, mt_test = load_mts_wikiQA(
        rootPath + 'Train_plus_dev_MT/concate_14mt_train.txt',
        rootPath + 'Test_MT/concate_14mt_test.txt')
    extra_train, extra_test = load_extra_features(
        rootPath +
        'train_plus_dev_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt',
        rootPath +
        'test_rule_features_cosine_eucli_negation_len1_len2_syn_hyper1_hyper2_anto(newsimi0.4).txt'
    )
    discri_train, discri_test = load_extra_features(
        rootPath + 'train_plus_dev_discri_features_0.3.txt',
        rootPath + 'test_discri_features_0.3.txt')

    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]

    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')
    '''
    indices_train_l=T.cast(indices_train_l, 'int32')
    indices_train_r=T.cast(indices_train_r, 'int32')
    indices_test_l=T.cast(indices_test_l, 'int32')
    indices_test_r=T.cast(indices_test_r, 'int32')
    '''

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    #     rand_values=load_word2vec_to_init(rand_values, rootPath+'vocab_glove_50d.txt')
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_glove_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    extra = T.dmatrix()
    discri = T.dmatrix()
    cost_tmp = T.dscalar()

    #     #GPU
    #     index = T.iscalar()
    #     x_index_l = T.imatrix('x_index_l')   # now, x is the index matrix, must be integer
    #     x_index_r = T.imatrix('x_index_r')
    #     y = T.ivector('y')
    #     left_l=T.iscalar()
    #     right_l=T.iscalar()
    #     left_r=T.iscalar()
    #     right_r=T.iscalar()
    #     length_l=T.iscalar()
    #     length_r=T.iscalar()
    #     norm_length_l=T.fscalar()
    #     norm_length_r=T.fscalar()
    #     #mts=T.dmatrix()
    #     #wmf=T.dmatrix()
    #     cost_tmp=T.fscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # shape of the input 'image': emb_size x maxSentLength
    filter_size = (emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = debug_print(
        embeddings[x_index_l.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_l_input')
    layer0_r_input = debug_print(
        embeddings[x_index_r.flatten()].reshape(
            (maxSentLength, emb_size)).transpose(), 'layer0_r_input')

    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor')

    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))  #25.2%
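    # The two features above are simple bag-of-words similarities: each sentence's word
    # embeddings are summed over its unpadded span, and the two sums are compared with
    # cosine similarity and 1/(1+Euclidean distance).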

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)

    cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))  #25.2%

    #ibm attentive pooling at extended sentence level
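    # Attentive pooling (the 'ibm' features below): a similarity matrix is computed between
    # the GRU hidden states of the two sentences; a softmax over its row-/column-wise maxima
    # gives one attention weight per position, and each sentence is pooled as the
    # attention-weighted sum of its hidden states. This is done at the original sentence
    # level; the extended-level variant is left commented out.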
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim,
        layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) / 2)
    #     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
    #     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
    #     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
    #     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()
    #     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
    #     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%

    #ibm attentive pooling at original sentence level
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))  #25.2%
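    # Below, each position's best match in the other sentence is read off attention_matrix;
    # the 3 positions with the lowest such scores (the least-aligned parts) are selected,
    # kept in their original order, and re-encoded by a second GRU (layer1) whose final
    # state becomes vec_l / vec_r.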

    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[:3]  #keep the 3 positions with the smallest max-attention
    ll = T.sort(kNeighborsArg).flatten()  # put the selected indices in ascending order

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[:3]  #keep the 3 positions with the smallest max-attention
    rr = T.sort(kNeighborsArg_r).flatten()  # put the selected indices in ascending order

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')

    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]

    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])),
                        'vec_l')
    vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])),
                        'vec_r')

    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #
    uni_cosine = cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))  #25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))
    #     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [
            vec_l,
            vec_r,
            uni_cosine,
            eucli_1,
            cosine_addition,
            eucli_addition,
            #                                 cosine_sent, eucli_sent,
            ibm_l.reshape((1, nkerns[0])),
            ibm_r.reshape((1, nkerns[0])),  #2*nkerns[0]+
            cosine_ibm,
            eucli_ibm,

            #                                 ibm_l_extended.reshape((1, nkerns[0])), ibm_r_extended.reshape((1, nkerns[0])), #2*nkerns[0]+
            #                                 cosine_ibm_extended, eucli_ibm_extended,
            mts,
            len_l,
            len_r,
            extra
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
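    # Dimensions of layer3_input, matching n_in below: vec_l+vec_r (2*nkerns[1]),
    # uni_cosine+eucli_1 (2), cosine_addition+eucli_addition (2), ibm_l+ibm_r (2*nkerns[0]),
    # cosine_ibm+eucli_ibm (2), mts (14 MT features), len_l+len_r (2), extra (9 rule features).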

    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(2 * nkerns[1] + 2) + 2 +
                                (2 * nkerns[0] + 2) + 14 + 2 + 9,
                                n_out=3)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() +
        (W1**2).sum(), 'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            extra: extra_test[index:index + batch_size],
            discri: discri_test[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer1_para + layer0_para  #+[embeddings]# + layer1.params
    #     params_conv = [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
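    # AdaGrad update (implemented in the loop below): each parameter keeps an accumulator of
    # squared gradients, and its effective step is learning_rate / sqrt(accumulator). For
    # example, with learning_rate=0.01 and an accumulated squared gradient of 4.0, the step
    # for that parameter is 0.01 * grad / 2.0, so frequently-updated parameters take smaller steps.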
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        updates.append(
            (param_i,
             param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))  #AdaGrad; 1e-8 avoids division by zero
        updates.append((acc_i, acc))


#     def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
#         updates = []
#         grads = T.grad(cost, params)
#         i = theano.shared(numpy.float64(0.))
#         i_t = i + 1.
#         fix1 = 1. - (1. - b1)**i_t
#         fix2 = 1. - (1. - b2)**i_t
#         lr_t = lr * (T.sqrt(fix2) / fix1)
#         for p, g in zip(params, grads):
#             m = theano.shared(p.get_value() * 0.)
#             v = theano.shared(p.get_value() * 0.)
#             m_t = (b1 * g) + ((1. - b1) * m)
#             v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
#             g_t = m_t / (T.sqrt(v_t) + e)
#             p_t = p - (lr_t * g_t)
#             updates.append((m, m_t))
#             updates.append((v, v_t))
#             updates.append((p, p_t))
#         updates.append((i, i_t))
#         return updates
#
#     updates=Adam(cost=cost, params=params, lr=learning_rate)

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            extra: extra_train[index:index + batch_size],
            discri: discri_train[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    train_model_predict = theano.function(
        [index, cost_tmp],
        [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            extra: extra_train[index:index + batch_size],
            discri: discri_train[index:index + batch_size]
        },
        on_unused_input='ignore',
        allow_input_downcast=True)

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False

    acc_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #         shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many batches have been run across epochs
            #             if (batch_start+1)%1000==0:
            #                 print batch_start+1,  'uses ', (time.time()-mid_time)/60.0, 'min'
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #print batch_start
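            # Note on update_freq: when iter is not a multiple of update_freq, only
            # train_model_predict runs, which accumulates cost_tmp and error_sum without
            # touching the parameters. On every update_freq-th batch, train_model runs;
            # since cost_tmp enters the graph as a plain scalar input, it does not
            # contribute to the gradient, so the update uses the current batch's cost
            # (plus the L2 term) scaled by 1/update_freq, while cost_tmp only turns the
            # monitored cost into an average over the skipped batches.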
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start, 0.0)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_losses = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    test_loss, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_losses.append(test_loss)
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])
                    #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+

                #write_file.close()
                test_score = numpy.mean(test_losses)
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test acc of best '
                     'model %f %%') % (epoch, minibatch_index, n_train_batches,
                                       (1 - test_score) * 100.))
                acc_nn = 1 - test_score
                #now, see the results of LR
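                # At each validation step, the concatenated feature vectors (layer3_input)
                # are also extracted for the training and test sets and fed to a linear SVM
                # and a scikit-learn LogisticRegression as alternative classifiers;
                # acc_max keeps the best of the NN / SVM / LR accuracies.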
                #write_feature=open(rootPath+'feature_check.txt', 'w')

                #this step is risky: if the training set is large, it can roughly double the training time
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start, 0.0)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results = clf.predict(test_features)
                lr = linear_model.LogisticRegression(C=1e5)
                lr.fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                corr_count = 0
                corr_count_lr = 0
                test_size = len(test_y)
                for i in range(test_size):
                    if results[i] == test_y[i]:
                        corr_count += 1
                    if results_lr[i] == test_y[i]:
                        corr_count_lr += 1
                acc_svm = corr_count * 1.0 / test_size
                acc_lr = corr_count_lr * 1.0 / test_size
                if acc_svm > acc_max:
                    acc_max = acc_svm
                    best_epoch = epoch
                if acc_lr > acc_max:
                    acc_max = acc_lr
                    best_epoch = epoch
                if acc_nn > acc_max:
                    acc_max = acc_nn
                    best_epoch = epoch
                print 'acc_nn:', acc_nn, 'acc_lr:', acc_lr, 'acc_svm:', acc_svm, ' max acc: ', acc_max, ' at epoch: ', best_epoch

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Code Example #21
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'):
    hidden_size=emb_size
    model_options = locals().copy()
    print "model options", model_options
    
    rng = np.random.RandomState(1234)    #random seed so the model generates the same results


    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,all_labels, word2id  =load_SNLI_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_sents_l=np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l=np.asarray(all_sentences_l[2], dtype='int32')
    
    train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX)
    
    train_sents_r=np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32')
        
    train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX)
            
    train_labels_store=np.asarray(all_labels[0], dtype='int32')
    dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    test_labels_store=np.asarray(all_labels[2], dtype='int32')
        
    train_size=len(train_labels_store)
    dev_size=len(dev_labels_store)
    test_size=len(test_labels_store)
    
    vocab_size=len(word2id)+1
                    
    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    #here, commented-out code is left for initializing the word embeddings with word2vec
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable      
    
    
    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'    
    
    common_input_l=embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    common_input_r=embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size))
    
    #conv
    if nn=='CNN':
        conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size))
        conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
        NN_para=[conv_W, conv_b]
         
        conv_input_l = common_input_l.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen)
        conv_model_l = Conv_with_input_para(rng, input=conv_input_l,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b)
        conv_output_l=conv_model_l.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1)    
        conv_output_into_tensor3_l=conv_output_l.reshape((batch_size, hidden_size, maxSentLen-filter_size+1))
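        # Masking trick for max-over-time pooling (used for both sentences below):
        #   with m in {0,1}: (1.0-m)*(m-10) = 0 when m==1 (real token) and -10 when m==0 (padding);
        #   adding this to the conv outputs pushes padded positions down to about -10, so the
        #   T.max over time ignores them, assuming the conv activations stay well above -10.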
        mask_for_conv_output_l=T.repeat(sents_mask_l[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_l=(1.0-mask_for_conv_output_l)*(mask_for_conv_output_l-10)
        masked_conv_output_l=conv_output_into_tensor3_l+mask_for_conv_output_l      #add the mask so padded positions become large negative values and are ignored by the max below
        sent_embeddings_l=T.max(masked_conv_output_l, axis=2) #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
     
        conv_input_r = common_input_r.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen)
        conv_model_r = Conv_with_input_para(rng, input=conv_input_r,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b)
        conv_output_r=conv_model_r.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1)    
        conv_output_into_tensor3_r=conv_output_r.reshape((batch_size, hidden_size, maxSentLen-filter_size+1))
        mask_for_conv_output_r=T.repeat(sents_mask_r[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_r=(1.0-mask_for_conv_output_r)*(mask_for_conv_output_r-10)
        masked_conv_output_r=conv_output_into_tensor3_r+mask_for_conv_output_r      #add the mask so padded positions become large negative values and are ignored by the max below
        sent_embeddings_r=T.max(masked_conv_output_r, axis=2) #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    
     
    #GRU
    if nn=='GRU':
        U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
        NN_para=[U1, W1, b1]     #U1 includes 3 matrices, W1 also includes 3 matrices, b1 is the bias
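        # The same U1, W1, b1 encode both the left and the right sentence, i.e. a shared
        # (siamese) GRU encoder; only the final sentence representation of each side is kept.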
        gru_input_l = common_input_l.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
        gru_layer_l=GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l,  hidden_size, U1, W1, b1)
        sent_embeddings_l=gru_layer_l.output_sent_rep  # (batch_size, hidden_size)
        gru_input_r = common_input_r.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
        gru_layer_r=GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r,  hidden_size, U1, W1, b1)
        sent_embeddings_r=gru_layer_r.output_sent_rep  # (batch_size, hidden_size)


    #LSTM
    if nn=='LSTM':
        LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
        NN_para=LSTM_para_dict.values() # .values returns a list of parameters
        lstm_input_l = common_input_l.dimshuffle((0,2,1)) #LSTM has the same input format as GRU
        lstm_layer_l=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l,  hidden_size, LSTM_para_dict)
        sent_embeddings_l=lstm_layer_l.output_sent_rep  # (batch_size, hidden_size)   
        lstm_input_r = common_input_r.dimshuffle((0,2,1)) #LSTM has the same input format as GRU
        lstm_layer_r=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r,  hidden_size, LSTM_para_dict)
        sent_embeddings_r=lstm_layer_r.output_sent_rep  # (batch_size, hidden_size)      
    
    
    HL_layer_1_input = T.concatenate([sent_embeddings_l,sent_embeddings_r, sent_embeddings_l*sent_embeddings_r, cosine_matrix1_matrix2_rowwise(sent_embeddings_l,sent_embeddings_r).dimshuffle(0,'x')],axis=1)
    HL_layer_1_input_size = hidden_size*3+1
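    # Standard sentence-pair representation [u; v; u*v; cos(u,v)], giving 3*hidden_size + 1 features.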
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh)

    #classification layer: it maps a feature vector of size LR_input_size to 3 class scores (entailment, neutral, contradiction)
    LR_input_size=HL_layer_1_input_size+2*hidden_size
    U_a = create_ensemble_para(rng, 3, LR_input_size) # the classification weight matrix mapping LR_input_size features to 3 classes
    LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class  
    LR_para=[U_a, LR_b]
    
    LR_input=T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1)
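    # LR_input concatenates the raw pair features with the outputs of both hidden layers
    # (a shortcut connection), hence LR_input_size = (3*hidden_size+1) + 2*hidden_size above.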
    layer_LR=LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)  #for classification tasks we usually use negative log likelihood as the loss; the lower the better.
    
    params = [embeddings]+NN_para+LR_para+HL_layer_1.params+HL_layer_2.params   # put all model parameters together
#     L2_reg =L2norm_paraList([embeddings,conv_W, U_a])
#     diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix)

    cost=loss#+Div_reg*diversify_reg#+L2_weight*L2_reg
    
    grads = T.grad(cost, params)    # create a list of gradients for all model parameters
    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #1e-8 is added to avoid division by zero
        updates.append((acc_i, acc))    


    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')    
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    
    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    n_dev_batches=dev_size/batch_size
    dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_acc_dev=0.0
    max_acc_test=0.0
    
    while epoch < n_epochs:
        epoch = epoch + 1
        train_indices = range(train_size)
        random.Random(200).shuffle(train_indices) #shuffle the training set each epoch; usually helps performance but is not guaranteed
        iter_accu=0
        cost_i=0.0
        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been run across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]
            cost_i+= train_model(
                                train_sents_l[train_id_batch], 
                                train_masks_l[train_id_batch],
                                train_sents_r[train_id_batch], 
                                train_masks_r[train_id_batch],                                
                                train_labels_store[train_id_batch])

            #after every 500 batches, we evaluate the model on the dev set (and on test when dev improves)
            if iter%500==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()
#             if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
#                 print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
#                 past_time = time.time()

                error_sum=0.0
                for dev_batch_id in dev_batch_start: # for each test batch
                    error_i=dev_model(
                                dev_sents_l[dev_batch_id:dev_batch_id+batch_size], 
                                dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                                dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 
                                dev_masks_r[dev_batch_id:dev_batch_id+batch_size],                                
                                dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                                )
                    
                    error_sum+=error_i
                dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                if dev_accuracy > max_acc_dev:
                    max_acc_dev=dev_accuracy
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                    #best dev model, do test
                    error_sum=0.0
                    for test_batch_id in test_batch_start: # for each test batch
                        error_i=test_model(
                                test_sents_l[test_batch_id:test_batch_id+batch_size], 
                                test_masks_l[test_batch_id:test_batch_id+batch_size],
                                test_sents_r[test_batch_id:test_batch_id+batch_size], 
                                test_masks_r[test_batch_id:test_batch_id+batch_size],                                
                                test_labels_store[test_batch_id:test_batch_id+batch_size]
                                )
                        
                        error_sum+=error_i
                    test_accuracy=1.0-error_sum/(len(test_batch_start))
                    if test_accuracy > max_acc_test:
                        max_acc_test=test_accuracy
                    print '\t\tcurrent test acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test
                else:
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                    
    return max_acc_test