예제 #1
0
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, control the model generates the same results


    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id  =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len)  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl')
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len)



    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)


    test_sents_r=np.asarray(test_sents_r, dtype='int32')


    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)


    test_word1=np.asarray(test_word1, dtype='int32')
    test_word2=np.asarray(test_word2, dtype='int32')


    test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX)


    test_labels_store=np.asarray(test_labels, dtype='int32')

    test_extra=np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size=len(test_sents_l)
    print ' test size: ', test_size

    vocab_size=len(word2id)+1


    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable
    # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1)
    word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1)


    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = word1_mask,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1)

    term_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_input_r,
             mask_matrix = word1_mask,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1)


    def_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = sents_mask_l,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1)


    def_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1)

    if p_mode == 'conc':
        p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4*4*hidden_size[1]
    else:
        p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0)
        p_len =4*hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)


    "form input to LR classifier"
    LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    LR_input_size=p_len+1+1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.


    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()





    params = NN_para+LR_para #[init_embeddings]


    # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)
    load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params)

    '''
    0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424
    0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538
    '''

    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    if n_test_remain!=0:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)



    # max_acc_dev=0.0
    max_ap_test=0.0
    max_ap_topk_test=0.0
    max_f1=0.0


    pred_labels =[]
    probs = []
    gold_labels =[]
    error_sum=0.0
    for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])

        # error_sum+=error_i
        pred_labels+=list(pred_i)
        probs+=list(prob_i)

    print len(test_sents_l), len(probs)
    if n_test_remain !=0:
        probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:]
    print len(test_sents_l), len(probs)
    assert len(test_sents_l) == len(probs)
    assert sum(group_size_list) == len(probs)
    #max prob in group
    max_probs = []
    prior_size = 0
    for i in range(len(group_size_list)):

        sub_probs = probs[prior_size:prior_size+group_size_list[i]]
        prior_size += group_size_list[i]
        max_probs.append(max(sub_probs))

    print len(group_size_list),len(max_probs),len(test_labels)
    assert len(test_labels) == len(max_probs)
    # test_acc=1.0-error_sum/(len(test_batch_start))
    test_ap = apk(test_labels, max_probs, k=len(test_labels))
    test_ap_top100 = apk(test_labels, max_probs, k=100)


    # if test_ap > max_ap_test:
    #     max_ap_test=test_ap
    #     store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params)
    # if test_ap_top100 > max_ap_topk_test:
    #     max_ap_topk_test=test_ap_top100
    print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=0.0000001, extra_size=4, use_svm=False, para_filename='',drop_p=0.1, div_weight=0.00001, emb_size=300, batch_size=50, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], margin =0.1, comment='five copies, sum&majority'):

    model_options = locals().copy()
#     print "model options", model_options


        





    #now, start to build the input form of the model
    train_flag = T.iscalar()
    first_sents_ids_l=T.imatrix()
    first_sents_mask_l=T.fmatrix()
    first_sents_ids_r=T.imatrix()
    first_sents_mask_r=T.fmatrix()
    first_labels=T.ivector()
    second_sents_ids_l=T.imatrix()
    second_sents_mask_l=T.fmatrix()
    second_sents_ids_r=T.imatrix()
    second_sents_mask_r=T.fmatrix()
    second_labels=T.ivector()
    third_sents_ids_l=T.imatrix()
    third_sents_mask_l=T.fmatrix()
    third_sents_ids_r=T.imatrix()
    third_sents_mask_r=T.fmatrix()
    third_labels=T.ivector()
    fourth_sents_ids_l=T.imatrix()
    fourth_sents_mask_l=T.fmatrix()
    fourth_sents_ids_r=T.imatrix()
    fourth_sents_mask_r=T.fmatrix()
    fourth_labels=T.ivector()
    fifth_sents_ids_l=T.imatrix()
    fifth_sents_mask_l=T.fmatrix()
    fifth_sents_ids_r=T.imatrix()
    fifth_sents_mask_r=T.fmatrix()
    fifth_labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    
    def common_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    first_common_input_l=common_input(first_embeddings, first_sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    first_common_input_r=common_input(first_embeddings, first_sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    second_common_input_l=common_input(second_embeddings, second_sents_ids_l)#second_embeddings[second_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    second_common_input_r=common_input(second_embeddings, second_sents_ids_r)#second_embeddings[second_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    third_common_input_l=common_input(third_embeddings, third_sents_ids_l)#third_embeddings[third_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    third_common_input_r=common_input(third_embeddings, third_sents_ids_r)#third_embeddings[third_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    fourth_common_input_l=common_input(fourth_embeddings, fourth_sents_ids_l)#fourth_embeddings[fourth_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    fourth_common_input_r=common_input(fourth_embeddings, fourth_sents_ids_r)#fourth_embeddings[fourth_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    fifth_common_input_l=common_input(fifth_embeddings, fifth_sents_ids_l)#fifth_embeddings[fifth_sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    fifth_common_input_r=common_input(fifth_embeddings, fifth_sents_ids_r)#fifth_embeddings[fifth_sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)




    gate_filter_shape=(hidden_size[0], 1, emb_size, 1)
    def create_CNN_params(rng):
        conv_W_2_pre, conv_b_2_pre=create_conv_para(rng, filter_shape=gate_filter_shape)
        conv_W_2_gate, conv_b_2_gate=create_conv_para(rng, filter_shape=gate_filter_shape)
        conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]))
        conv_W_2_context, conv_b_2_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, hidden_size[0], 1))
        return conv_W_2_pre, conv_b_2_pre,conv_W_2_gate, conv_b_2_gate,conv_W_2, conv_b_2,conv_W_2_context, conv_b_2_context
    
    first_conv_W_pre, first_conv_b_pre,first_conv_W_gate, first_conv_b_gate,first_conv_W, first_conv_b,first_conv_W_context, first_conv_b_context = create_CNN_params(first_rng)
    second_conv_W_pre, second_conv_b_pre,second_conv_W_gate, second_conv_b_gate,second_conv_W, second_conv_b,second_conv_W_context, second_conv_b_context = create_CNN_params(second_rng)
    third_conv_W_pre, third_conv_b_pre,third_conv_W_gate, third_conv_b_gate,third_conv_W, third_conv_b,third_conv_W_context, third_conv_b_context = create_CNN_params(third_rng)
    fourth_conv_W_pre, fourth_conv_b_pre,fourth_conv_W_gate, fourth_conv_b_gate,fourth_conv_W, fourth_conv_b,fourth_conv_W_context, fourth_conv_b_context = create_CNN_params(fourth_rng)
    fifth_conv_W_pre, fifth_conv_b_pre,fifth_conv_W_gate, fifth_conv_b_gate,fifth_conv_W, fifth_conv_b,fifth_conv_W_context, fifth_conv_b_context = create_CNN_params(fifth_rng)

    '''
    dropout paras
    '''
    def dropout_group(rng, conv_W_2_pre, conv_W_2_gate, conv_W_2, conv_W_2_context):
        drop_conv_W_2_pre = dropout_layer(rng, conv_W_2_pre, drop_p, train_flag)
        drop_conv_W_2_gate = dropout_layer(rng, conv_W_2_gate, drop_p, train_flag)
        drop_conv_W_2 = dropout_layer(rng, conv_W_2, drop_p, train_flag)
        drop_conv_W_2_context = dropout_layer(rng, conv_W_2_context, drop_p, train_flag)
        return drop_conv_W_2_pre,drop_conv_W_2_gate,drop_conv_W_2,drop_conv_W_2_context
    drop_first_conv_W_pre,drop_first_conv_W_gate,drop_first_conv_W,drop_first_conv_W_context = dropout_group(first_srng, first_conv_W_pre, first_conv_W_gate, first_conv_W, first_conv_W_context)
    drop_second_conv_W_pre,drop_second_conv_W_gate,drop_second_conv_W,drop_second_conv_W_context = dropout_group(second_srng, second_conv_W_pre, second_conv_W_gate, second_conv_W, second_conv_W_context)
    drop_third_conv_W_pre,drop_third_conv_W_gate,drop_third_conv_W,drop_third_conv_W_context = dropout_group(third_srng, third_conv_W_pre, third_conv_W_gate, third_conv_W, third_conv_W_context)
    drop_fourth_conv_W_pre,drop_fourth_conv_W_gate,drop_fourth_conv_W,drop_fourth_conv_W_context = dropout_group(fourth_srng, fourth_conv_W_pre, fourth_conv_W_gate, fourth_conv_W, fourth_conv_W_context)
    drop_fifth_conv_W_pre,drop_fifth_conv_W_gate,drop_fifth_conv_W,drop_fifth_conv_W_context = dropout_group(fifth_srng, fifth_conv_W_pre, fifth_conv_W_gate, fifth_conv_W, fifth_conv_W_context)
    
    first_NN_para=[#conv_W, conv_b,
            first_conv_W_pre, first_conv_b_pre,
            first_conv_W_gate, first_conv_b_gate,
            first_conv_W, first_conv_b,first_conv_W_context]
    second_NN_para=[
            second_conv_W_pre, second_conv_b_pre,
            second_conv_W_gate, second_conv_b_gate,
            second_conv_W, second_conv_b,second_conv_W_context]

    third_NN_para=[
            third_conv_W_pre, third_conv_b_pre,
            third_conv_W_gate, third_conv_b_gate,
            third_conv_W, third_conv_b,third_conv_W_context]
    fourth_NN_para=[
            fourth_conv_W_pre, fourth_conv_b_pre,
            fourth_conv_W_gate, fourth_conv_b_gate,
            fourth_conv_W, fourth_conv_b,fourth_conv_W_context]

    fifth_NN_para=[
            fifth_conv_W_pre, fifth_conv_b_pre,
            fifth_conv_W_gate, fifth_conv_b_gate,
            fifth_conv_W, fifth_conv_b,fifth_conv_W_context]
    '''
    first classifier
    '''
    def classifier(rng,common_input_l,common_input_r,sents_mask_l, sents_mask_r,drop_conv_W_2_pre,conv_b_2_pre,drop_conv_W_2_gate,conv_b_2_gate,drop_conv_W_2,conv_b_2,drop_conv_W_2_context,
                   conv_b_2_context,labels):
        conv_layer_2_gate_l = Conv_with_Mask_with_Gate(rng, input_tensor3=common_input_l,
                 mask_matrix = sents_mask_l,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=gate_filter_shape,
                 W=drop_conv_W_2_pre, b=conv_b_2_pre,
                 W_gate =drop_conv_W_2_gate, b_gate=conv_b_2_gate )
        conv_layer_2_gate_r = Conv_with_Mask_with_Gate(rng, input_tensor3=common_input_r,
                 mask_matrix = sents_mask_r,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=gate_filter_shape,
                 W=drop_conv_W_2_pre, b=conv_b_2_pre,
                 W_gate =drop_conv_W_2_gate, b_gate=conv_b_2_gate )
    
        l_input_4_att = conv_layer_2_gate_l.output_tensor3#conv_layer_2_gate_l.masked_conv_out_sigmoid*conv_layer_2_pre_l.masked_conv_out+(1.0-conv_layer_2_gate_l.masked_conv_out_sigmoid)*common_input_l
        r_input_4_att = conv_layer_2_gate_r.output_tensor3#conv_layer_2_gate_r.masked_conv_out_sigmoid*conv_layer_2_pre_r.masked_conv_out+(1.0-conv_layer_2_gate_r.masked_conv_out_sigmoid)*common_input_r
    
        conv_layer_2 = Conv_for_Pair(rng,
                origin_input_tensor3=common_input_l,
                origin_input_tensor3_r = common_input_r,
                input_tensor3=l_input_4_att,
                input_tensor3_r = r_input_4_att,
                 mask_matrix = sents_mask_l,
                 mask_matrix_r = sents_mask_r,
                 image_shape=(batch_size, 1, hidden_size[0], maxSentLen),
                 image_shape_r = (batch_size, 1, hidden_size[0], maxSentLen),
                 filter_shape=(hidden_size[1], 1, hidden_size[0], filter_size[0]),
                 filter_shape_context=(hidden_size[1], 1,hidden_size[0], 1),
                 W=drop_conv_W_2, b=conv_b_2,
                 W_context=drop_conv_W_2_context, b_context=conv_b_2_context)
        attentive_sent_embeddings_l_2 = conv_layer_2.attentive_maxpool_vec_l
        attentive_sent_embeddings_r_2 = conv_layer_2.attentive_maxpool_vec_r
        # attentive_sent_sumpool_l_2 = conv_layer_2.attentive_sumpool_vec_l
        # attentive_sent_sumpool_r_2 = conv_layer_2.attentive_sumpool_vec_r
    
        HL_layer_1_input = T.concatenate([attentive_sent_embeddings_l_2,attentive_sent_embeddings_r_2, attentive_sent_embeddings_l_2*attentive_sent_embeddings_r_2],axis=1)
    
        HL_layer_1_input_size = hidden_size[1]*3#+extra_size#+(maxSentLen*2+10*2)#+hidden_size[1]*3+1
    
        HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.nnet.relu)
        HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[0], activation=T.nnet.relu)

        LR_input_size=HL_layer_1_input_size+2*hidden_size[0]
        U_a = create_ensemble_para(rng, 3, LR_input_size) # the weight matrix hidden_size*2
        LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
        LR_para=[U_a, LR_b]
    
        LR_input=T.tanh(T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1))
        layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
        loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

        return loss, LR_para+HL_layer_1.params+HL_layer_2.params, layer_LR.p_y_given_x, layer_LR.errors(labels)
    
    first_loss, first_classifier_params, first_test_distr, first_error = classifier(first_rng,first_common_input_l,first_common_input_r,first_sents_mask_l,first_sents_mask_r,drop_first_conv_W_pre,first_conv_b_pre,
                                                                       drop_first_conv_W_gate,first_conv_b_gate,drop_first_conv_W,first_conv_b,drop_first_conv_W_context,
                                                                       first_conv_b_context, first_labels)
    second_loss, second_classifier_params, second_test_distr, second_error = classifier(second_rng,second_common_input_l,second_common_input_r,second_sents_mask_l,second_sents_mask_r,drop_second_conv_W_pre,second_conv_b_pre,
                                                                       drop_second_conv_W_gate,second_conv_b_gate,drop_second_conv_W,second_conv_b,drop_second_conv_W_context,
                                                                       second_conv_b_context, second_labels)
    third_loss, third_classifier_params, third_test_distr, third_error = classifier(third_rng,third_common_input_l,third_common_input_r,third_sents_mask_l,third_sents_mask_r,drop_third_conv_W_pre,third_conv_b_pre,
                                                                       drop_third_conv_W_gate,third_conv_b_gate,drop_third_conv_W,third_conv_b,drop_third_conv_W_context,
                                                                       third_conv_b_context, third_labels)
    fourth_loss, fourth_classifier_params, fourth_test_distr, fourth_error = classifier(fourth_rng,fourth_common_input_l,fourth_common_input_r,fourth_sents_mask_l,fourth_sents_mask_r,drop_fourth_conv_W_pre,fourth_conv_b_pre,
                                                                       drop_fourth_conv_W_gate,fourth_conv_b_gate,drop_fourth_conv_W,fourth_conv_b,drop_fourth_conv_W_context,
                                                                       fourth_conv_b_context, fourth_labels)
    fifth_loss, fifth_classifier_params, fifth_test_distr, fifth_error = classifier(fifth_rng,fifth_common_input_l,fifth_common_input_r,fifth_sents_mask_l,fifth_sents_mask_r,drop_fifth_conv_W_pre,fifth_conv_b_pre,
                                                                       drop_fifth_conv_W_gate,fifth_conv_b_gate,drop_fifth_conv_W,fifth_conv_b,drop_fifth_conv_W_context,
                                                                       fifth_conv_b_context, fifth_labels)


    '''
    testing, labels == second_labels
    '''
    all_prop_distr = first_test_distr+second_test_distr+third_test_distr+fourth_test_distr+fifth_test_distr
#     first_preds = T.argmax(first_test_distr, axis=1).dimshuffle('x',0) #(1, batch)
#     second_preds = T.argmax(second_test_distr, axis=1).dimshuffle('x',0) #(1, batch)
#     third_preds = T.argmax(third_test_distr, axis=1).dimshuffle('x',0) #(1, batch)
#     fourth_preds = T.argmax(fourth_test_distr, axis=1).dimshuffle('x',0) #(1, batch)
#     fifth_preds = T.argmax(fifth_test_distr, axis=1).dimshuffle('x',0) #(1, batch)
#     overall_preds = T.concatenate([first_preds,second_preds,third_preds,fourth_preds,fifth_preds], axis=0) #(5, batch)
#     all_error = T.mean(T.neq(T.argmax(all_prop_distr, axis=1), first_labels))



#     neg_labels = T.where( labels < 2, 2, labels-1)
#     loss2=-T.mean(T.log(1.0/(1.0+layer_LR.p_y_given_x))[T.arange(neg_labels.shape[0]), neg_labels])

    # rank loss
    # entail_prob_batch = T.nnet.softmax(layer_LR.before_softmax.T)[2] #batch
    # entail_ids = elementwise_is_two(labels)
    # entail_probs = entail_prob_batch[entail_ids.nonzero()]
    # non_entail_probs = entail_prob_batch[(1-entail_ids).nonzero()]
    #
    # repeat_entail = T.extra_ops.repeat(entail_probs, non_entail_probs.shape[0], axis=0)
    # repeat_non_entail = T.extra_ops.repeat(non_entail_probs.dimshuffle('x',0), entail_probs.shape[0], axis=0).flatten()
    # loss2 = -T.mean(T.log(entail_probs))#T.mean(T.maximum(0.0, margin-repeat_entail+repeat_non_entail))

    # zero_matrix = T.zeros((batch_size, 3))
    # filled_zero_matrix = T.set_subtensor(zero_matrix[T.arange(batch_size), labels], 1.0)
    # prob_batch_posi = layer_LR.p_y_given_x[filled_zero_matrix.nonzero()]
    # prob_batch_nega = layer_LR.p_y_given_x[(1-filled_zero_matrix).nonzero()]
    #
    # repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    # repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    # loss2 = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    
    first_params = [first_embeddings]+first_NN_para+first_classifier_params
    second_params = [second_embeddings]+second_NN_para+second_classifier_params
    third_params = [third_embeddings]+third_NN_para+third_classifier_params
    fourth_params = [fourth_embeddings]+fourth_NN_para+fourth_classifier_params
    fifth_params = [fifth_embeddings]+fifth_NN_para+fifth_classifier_params
    
    params = first_params+second_params+third_params+fourth_params+fifth_params
    load_model_from_file(para_filename, params)
    

    

    test_model = theano.function([train_flag,first_sents_ids_l, first_sents_mask_l, first_sents_ids_r, first_sents_mask_r, first_labels,
                                  second_sents_ids_l,second_sents_mask_l,second_sents_ids_r,second_sents_mask_r,second_labels,
                                  third_sents_ids_l,third_sents_mask_l,third_sents_ids_r,third_sents_mask_r,third_labels,
                                  fourth_sents_ids_l,fourth_sents_mask_l,fourth_sents_ids_r,fourth_sents_mask_r,fourth_labels,
                                  fifth_sents_ids_l,fifth_sents_mask_l,fifth_sents_ids_r,fifth_sents_mask_r,fifth_labels], all_prop_distr, allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... testing'
    # early-stopping parameters

    start_time = time.time()
    mid_time = start_time


#     n_train_batches=train_size/batch_size
#     train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
#     n_dev_batches=dev_size/batch_size
#     dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]



#     train_indices = range(train_size)
#     para_filenames=['model_para_0.846294416244','model_para_0.845279187817', 'model_para_0.839695431472']
    gold_ys= []
    distr_list=[]
    for test_batch_id in test_batch_start: # for each test batch
        distr_batch=test_model(
                            0,
                            test_sents_l[test_batch_id:test_batch_id+batch_size],
                            test_masks_l[test_batch_id:test_batch_id+batch_size],
                            test_sents_r[test_batch_id:test_batch_id+batch_size],
                            test_masks_r[test_batch_id:test_batch_id+batch_size],
                            test_labels_store[test_batch_id:test_batch_id+batch_size],
                            
                            test_sents_l[test_batch_id:test_batch_id+batch_size],
                            test_masks_l[test_batch_id:test_batch_id+batch_size],
                            test_sents_r[test_batch_id:test_batch_id+batch_size],
                            test_masks_r[test_batch_id:test_batch_id+batch_size],
                            test_labels_store[test_batch_id:test_batch_id+batch_size],
                            
                            test_sents_l[test_batch_id:test_batch_id+batch_size],
                            test_masks_l[test_batch_id:test_batch_id+batch_size],
                            test_sents_r[test_batch_id:test_batch_id+batch_size],
                            test_masks_r[test_batch_id:test_batch_id+batch_size],
                            test_labels_store[test_batch_id:test_batch_id+batch_size],

                            test_sents_l[test_batch_id:test_batch_id+batch_size],
                            test_masks_l[test_batch_id:test_batch_id+batch_size],
                            test_sents_r[test_batch_id:test_batch_id+batch_size],
                            test_masks_r[test_batch_id:test_batch_id+batch_size],
                            test_labels_store[test_batch_id:test_batch_id+batch_size],
                            
                            test_sents_l[test_batch_id:test_batch_id+batch_size],
                            test_masks_l[test_batch_id:test_batch_id+batch_size],
                            test_sents_r[test_batch_id:test_batch_id+batch_size],
                            test_masks_r[test_batch_id:test_batch_id+batch_size],
                            test_labels_store[test_batch_id:test_batch_id+batch_size]
                )
        gold_ys.append(test_labels_store[test_batch_id:test_batch_id+batch_size])
        distr_list.append(distr_batch)
    distr_file = np.concatenate(distr_list, axis=0)
    gold_ys = np.concatenate(gold_ys)



    return distr_file, gold_ys
예제 #3
0
def evaluate_lenet5(claim, title2sentlist, title2wordlist, word2id):
    learning_rate = 0.02
    n_epochs = 100
    emb_size = 300
    batch_size = 1  #50
    filter_size = [3]
    sent_len = 40
    claim_len = 40
    cand_size = 10
    hidden_size = [300, 300]
    max_pred_pick = 5

    # model_options = locals().copy()
    # print("model options", model_options)
    # print('title2sentlist len', len(title2sentlist))
    # print('title2wordlist len', len(title2wordlist))

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    claim_idlist, claim_masklist, sent_ins_ids, sent_ins_mask, sent_cand_list = claim_input_2_theano_input(
        claim, word2id, claim_len, sent_len, cand_size, title2sentlist,
        title2wordlist)

    test_claims = np.asarray([claim_idlist], dtype='int32')
    test_claim_mask = np.asarray([claim_masklist], dtype=theano.config.floatX)

    test_sents = np.asarray([sent_ins_ids], dtype='int32')
    test_sent_masks = np.asarray([sent_ins_mask], dtype=theano.config.floatX)

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    # id2word = {y:x for x,y in word2id.items()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable
    '''
    the first block for evidence identification in two classes (support & reject)
    the second block for textual entailment: given evidence labels, predict the claim labels
    '''
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    # sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    # joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    # joint_sents_mask=T.ftensor3()
    # # joint_sents_labels=T.imatrix() #(batch, cand_size)
    # joint_claim_ids = T.imatrix() #(batch, claim_len)
    # joint_claim_mask = T.fmatrix()
    # joint_labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    "shared parameters"
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    "tasl 1 parameters"
    task1_att_conv_W, task1_att_conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    "task 2 parameters"
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W,
        att_conv_b, task1_conv_W_context, conv_W_context
    ]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv for task1
    '''
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=task1_att_conv_W,
        b=task1_att_conv_b,
        W_context=task1_conv_W_context,
        b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))
    "to score each evidence sentence, we use the output of attentiveConv, as well as the output of standard CNN"
    LR_input = T.concatenate([
        concate_2_matrix, task1_attentive_sent_embeddings_l,
        task1_attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # the weight matrix hidden_size*2
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #batch * 12
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    binarize_prob = T.where(inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size)
    sents_labels = inter_matrix * binarize_prob
    '''
    training task2, predict 3 labels
    '''
    # joint_embed_input_sents=init_embeddings[joint_sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    # joint_embed_input_claim=init_embeddings[joint_claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1)
    # joint_conv_model_sents = Conv_with_Mask(rng, input_tensor3=joint_embed_input_sents,
    #          mask_matrix = joint_sents_mask.reshape((joint_sents_mask.shape[0]*joint_sents_mask.shape[1],joint_sents_mask.shape[2])),
    #          image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # joint_sent_embeddings=joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    # joint_batch_sent_emb = joint_sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))
    # "??? use joint_sents_labels means the evidence labels are not provided by task 1?"
    # joint_premise_emb = T.sum(joint_batch_sent_emb*joint_sents_labels.dimshuffle(0,1,'x'), axis=1) #(batch, hidden_size)

    premise_emb = T.sum(batch_sent_emb * sents_labels.dimshuffle(0, 1, 'x'),
                        axis=1)

    # joint_conv_model_claims = Conv_with_Mask(rng, input_tensor3=joint_embed_input_claim,
    #          mask_matrix = joint_claim_mask,
    #          image_shape=(batch_size, 1, emb_size, claim_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # joint_claim_embeddings=joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    premise_hypo_emb = T.concatenate([premise_emb, claim_embeddings],
                                     axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(
        0, 2, 1))  #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape(
            (batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape(
            (batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)
    ).dimshuffle(
        0, 2, 1
    )  #T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        add_sents_context,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size,
         hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * sents_labels.dimshuffle(
        0, 1, 'x')
    masked_claim_attconv = attentive_sent_embeddings_r * sents_labels.dimshuffle(
        0, 1, 'x')
    fine_max = T.concatenate([
        T.max(masked_sents_attconv, axis=1),
        T.max(masked_claim_attconv, axis=1)
    ],
                             axis=1)  #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    # joint_loss=joint_layer_LR.negative_log_likelihood(joint_labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    print('initialze model parameters...')
    load_model_from_file(
        '/home1/w/wenpeng/dataset/FEVER/model_para_0.9936287838053803', params)

    # train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask,joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids, sents_mask, claim_ids, claim_mask],
        [inter_matrix, binarize_prob, joint_layer_LR.y_pred],
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([sents_ids,sents_mask, claim_ids,claim_mask], [binarize_prob,joint_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... testing')
    # early-stopping parameters

    batch_score_vec, batch_binary_vec, pred_i = test_model(
        test_sents, test_sent_masks, test_claims, test_claim_mask)
    sorted_indices = np.argsort(batch_score_vec[0])[::-1]  #descending order
    selected_sents = []
    for index in sorted_indices:
        if batch_binary_vec[0][index] == 1:
            selected_sents.append(sent_cand_list[index])
            if len(selected_sents) == 5:
                break

    # for i, indicator in enumerate(list(batch_binary_vec[0])):
    #     if indicator == 1:
    #         selected_sents.append(sent_cand_list[i])
    return pred_id2label.get(
        pred_i[0]) + '"<p>"' + '"<br />"'.join(selected_sents) + '"<p/>"'
예제 #4
0
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=10, hidden_size=10,
                    L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=0.217545454546):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)


    


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in overall_word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')   
    questions = T.imatrix('questions')  
    labels = T.imatrix('labels')
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)


    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    
    norm_extraF=normalize_matrix(extraF)

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] 

    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] 

    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features
    LR_b = theano.shared(value=numpy.zeros((2,),
                                                 dtype=theano.config.floatX),  # @UndefinedVariable
                               name='LR_b', borrow=True)
     
    attention_paras=[W_a1, W_a2, U_a, LR_b]  
    params = [embeddings]+paragraph_para+Q_para+attention_paras
    
    load_model_from_file(rootPath+'Best_Paras_conv_0.217545454545', params)
    
    paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
    concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1)


    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)

#     #LSTM
#     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
#     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
#     para_reps=paragraph_model.output_tensor
 
    Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)

    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
#     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor=questions_model.output_tensor
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)
    
#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor
        
#use CNN for question modeling
#     Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
#     conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
#     Q_conv_para=[conv_W, conv_b]
#     conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
#             image_shape=(batch_size, 1, emb_size, q_len_limit),
#             filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
#     conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
#     gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
#     masked_conv_output=conv_output*gru_mask
#     questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))





    
#     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
#     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
#     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
#     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
#     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1) 
#     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1) 
#     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)
    
    
    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix=para_matrix.T
        interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T) #(len, para_len)
    batch_q_reps, updates = theano.scan(fn=example_in_batch,
                                   outputs_info=None,
                                   sequences=[para_reps, questions_reps_tensor])    #batch_q_reps (batch, hidden, para_len)
    
       
    #attention distributions
  
    norm_W_a1=normalize_matrix(W_a1)
    norm_W_a2=normalize_matrix(W_a2)
    norm_U_a=normalize_matrix(U_a)


    
    transformed_para_reps=T.maximum(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2),0.0)   #relu
    transformed_q_reps=T.maximum(T.dot(batch_q_reps.transpose((0, 2,1)), norm_W_a1),0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)    
    
    add_both=transformed_para_reps+transformed_q_reps

#     U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
#     U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
#     accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b] 
#     accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)    
#     accu_both=accumu_model.output_tensor.transpose((0,2,1))
    
    prior_att=T.concatenate([add_both, norm_extraF], axis=2)
    
    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices=para_mask.flatten().nonzero()[0]
    
    layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.sum(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y])

    distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
#     masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis=distributions*para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''
   
#     def AttentionLayer(q_rep, ext_M):
#         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
#         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
#        f __name__ == '__main__': 
#         prior_att=T.concatenate([prior_att, ext_M], axis=1)
#                               
#         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
#         return strength.transpose() #(1, #words)
 
#     distributions, updates = theano.scan(
#     AttentionLayer,
#     sequences=[questions_reps,extraF] )
    
#     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
#     labels=debug_print(labels, 'labels')
#     label_mask=T.gt(labels,0.0)
#     neg_label_mask=T.lt(labels,0.0)
#     dis_masked=distributions*label_mask
#     remain_dis_masked=distributions*neg_label_mask
#     pos_error=((dis_masked-1)**2).mean()
#     neg_error=((remain_dis_masked-(-1))**2).mean()
#     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error
    


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
    L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=error#+ConvGRU_1.error#
    
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
        
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
  
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))    



    train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore')
    
    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False
    

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        
        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                                np.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), 
                                      np.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), 
                                      np.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'), 
                                      np.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      np.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      np.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX))

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                  
                exact_match=0.0
                F1_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    distribution_matrix=test_model(
                                        np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'), 
                                              np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'), 
                                              np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                                              np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                                              np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX))
                    
#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size]
                    para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size]
                    paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    para_len=len(test_para_wordlist_list[0])
                    if para_len!=len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=batch_size
#                     print q_size
#                     print test_para_word_list
                    
                    Q_list_inword=test_Q_list_word[test_para_id:test_para_id+batch_size]
                    for q in range(batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set=para_gold_ansset_list[q]
#                         print test_para_wordlist_list[q]
#                         print Q_list_inword[q]
#                         print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match+=1
                        F1=MacroF1(pred_ans, q_gold_ans_set)
                        F1_match+=F1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc=F1_match/q_amount
                exact_acc=exact_match/q_amount
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params)
                        print 'Finished storing best  params at:', max_exact_acc  
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
                        



            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(term1_str, term2_str):
    emb_size=300
    filter_size=[3,3]
    maxSentLen=40
    hidden_size=[300,300]
    max_term_len=4
    p_mode = 'conc'
    batch_size = 1

    term1_def, source1 = load_concept_def(term1_str)
    print '\n',term1_str, ':\t', term1_def,'\t', source1,'\n'
    term2_def, source2 = load_concept_def(term2_str)
    print '\n',term2_str, ':\t', term2_def, '\t', source2,'\n'
    # exit(0)

    word2id = load_word2id('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_word2id.pkl')
    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    #random seed, control the model generates the same results

    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id  =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len)  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id = load_EVAlution_hyper_vs_all_with_words(maxSentLen, word2id, wordlen=max_term_len)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask, test_extra, word2id = parse_individual_termPair(term1_str, term2_str, term1_def, term2_def, maxSentLen, word2id, wordlen=max_term_len)
    # total_size = len(all_sentences_l)
    # hold_test_size = 10000
    # train_size = total_size - hold_test_size



    # train_sents_l=np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    # train_masks_l=np.asarray(all_masks_l[:train_size], dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)

    # train_sents_r=np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r=np.asarray(test_sents_r, dtype='int32')

    # train_masks_r=np.asarray(all_masks_r[:train_size], dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)

    # train_word1=np.asarray(all_word1[:train_size], dtype='int32')
    # train_word2=np.asarray(all_word2[:train_size], dtype='int32')
    test_word1=np.asarray(test_word1, dtype='int32')
    test_word2=np.asarray(test_word2, dtype='int32')

    # train_word1_mask=np.asarray(all_word1_mask[:train_size], dtype=theano.config.floatX)
    # train_word2_mask=np.asarray(all_word2_mask[:train_size], dtype=theano.config.floatX)
    test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX)

    # train_labels_store=np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    # test_labels_store=np.asarray(test_labels, dtype='int32')

    # train_extra=np.asarray(all_extra[:train_size], dtype=theano.config.floatX)
    test_extra=np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size=len(test_extra)
    print ' test size: ', len(test_extra)

    vocab_size=len(word2id)+1


    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
    # rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    # id2word = {y:x for x,y in word2id.iteritems()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable
    # store_model_to_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings])
    # exit(0)
    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    # labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1)
    word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1)


    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = word1_mask,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1)

    term_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_input_r,
             mask_matrix = word1_mask,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1)


    def_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = sents_mask_l,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1)


    def_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1)

    if p_mode == 'conc':
        p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4*4*hidden_size[1]
    else:
        p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0)
        p_len =4*hidden_size[1]
    "form input to LR classifier"
    LR_input = T.concatenate([p,extra.dimshuffle(0,'x')],axis=1)
    LR_input_size=p_len+1

    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector



    params = NN_para+LR_para #[init_embeddings]
    load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_embeddings', [init_embeddings])

    load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)

    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... testing'


    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    if n_test_remain!=0:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)



    # max_acc_dev=0.0
    # max_ap_test=0.0
    # max_ap_topk_test=0.0
    # max_f1=0.0

    # cost_i=0.0
    # train_indices = range(train_size)


    for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])
        print pred_i, prob_i
예제 #6
0
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, batch_size=20, test_batch_size=200, emb_size=300, hidden_size=300,
                    L2_weight=0.0001, para_len_limit=400, q_len_limit=40, max_EM=50.302743615):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    

#     glove_vocab=set(word2vec.keys())
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)
    
    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)





    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in overall_word2id.iteritems()}
#     word2vec=load_glove()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
#     labels = T.imatrix('labels')  #(batch, para_len)
    gold_indices= T.ivector() #batch
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)
    is_train = T.iscalar()


    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size=paragraph.shape[0]

    norm_extraF=normalize_matrix(extraF)

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b]

    U_e1, W_e1, b_e1=create_GRU_para(rng, 3*hidden_size+3, hidden_size)
    U_e1_b, W_e1_b, b_e1_b=create_GRU_para(rng, 3*hidden_size+3, hidden_size)
    paragraph_para_e1=[U_e1, W_e1, b_e1, U_e1_b, W_e1_b, b_e1_b]


    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

#     W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
#     W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, 2*hidden_size) # 3 extra features
#     LR_b = theano.shared(value=numpy.zeros((2,),
#                                                  dtype=theano.config.floatX),  # @UndefinedVariable
#                                name='LR_b', borrow=True)

    HL_paras=[U_a]
    params = [embeddings]+paragraph_para+Q_para+paragraph_para_e1+HL_paras

    load_model_from_file(rootPath+'Best_Paras_conv_50.302743614', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
    concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1)

    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)

#     #LSTM
#     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
#     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
#     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)
    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    questions_reps_tensor=questions_model.output_tensor
    questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden)
    questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)  #(batch, para_len, hidden)

#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor






    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix=para_matrix.T
        interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
#         norm_interaction_matrix=T.maximum(0.0, interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(len, para_len)
    batch_q_reps, updates = theano.scan(fn=example_in_batch,
                                   outputs_info=None,
                                   sequences=[para_reps, questions_reps_tensor])    #batch_q_reps (batch, hidden, para_len)



    #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len)
    ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), norm_extraF.dimshuffle(0,2,1)], axis=1) #(batch, 3*hidden+3, para_len)

    
    para_ensemble_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,U=U_e1,W=W_e1,b=b_e1,Ub=U_e1_b,Wb=W_e1_b,bb=b_e1_b)
    para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, hidden ,para_len)
    para_reps_tensor4score = dropout_standard(is_train, para_reps_tensor4score, 0.2, rng)
    #for span reps
    span_1=T.concatenate([para_reps_tensor4score, para_reps_tensor4score], axis=1) #(batch, 2*hidden ,para_len)
    span_2=T.concatenate([para_reps_tensor4score[:,:,:-1], para_reps_tensor4score[:,:,1:]], axis=1) #(batch, 2*hidden ,para_len-1)
    span_3=T.concatenate([para_reps_tensor4score[:,:,:-2], para_reps_tensor4score[:,:,2:]], axis=1) #(batch, 2*hidden ,para_len-2)
    span_4=T.concatenate([para_reps_tensor4score[:,:,:-3], para_reps_tensor4score[:,:,3:]], axis=1) #(batch, 2*hidden ,para_len-3)
    span_5=T.concatenate([para_reps_tensor4score[:,:,:-4], para_reps_tensor4score[:,:,4:]], axis=1) #(batch, 2*hidden ,para_len-4)
    span_6=T.concatenate([para_reps_tensor4score[:,:,:-5], para_reps_tensor4score[:,:,5:]], axis=1) #(batch, 2*hidden ,para_len-5)
    span_7=T.concatenate([para_reps_tensor4score[:,:,:-6], para_reps_tensor4score[:,:,6:]], axis=1) #(batch, 2*hidden ,para_len-6)

    span_8=T.concatenate([para_reps_tensor4score[:,:,:-7], para_reps_tensor4score[:,:,7:]], axis=1) #(batch, 2*hidden ,para_len-7)
    span_9=T.concatenate([para_reps_tensor4score[:,:,:-8], para_reps_tensor4score[:,:,8:]], axis=1) #(batch, 2*hidden ,para_len-8)
    span_10=T.concatenate([para_reps_tensor4score[:,:,:-9], para_reps_tensor4score[:,:,9:]], axis=1) #(batch, 2*hidden ,para_len-9)
    span_11=T.concatenate([para_reps_tensor4score[:,:,:-10], para_reps_tensor4score[:,:,10:]], axis=1) #(batch, 2*hidden ,para_len-10)
    span_12=T.concatenate([para_reps_tensor4score[:,:,:-11], para_reps_tensor4score[:,:,11:]], axis=1) #(batch, 2*hidden ,para_len-11)
    span_13=T.concatenate([para_reps_tensor4score[:,:,:-12], para_reps_tensor4score[:,:,12:]], axis=1) #(batch, 2*hidden ,para_len-12)

    span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7,
                             span_8, span_9, span_10, span_11, span_12, span_13], axis=2) #(batch, 2*hidden, 13*para_len-78)
    test_span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7], axis=2) #(batch, 2*hidden, 5*para_len-10)  #, span_6, span_7
    #score each span reps
    norm_U_a=normalize_matrix(U_a)
    span_scores_tensor=T.dot(span_reps.dimshuffle(0,2,1), norm_U_a)  #(batch, 13*para_len-78, 1)
    span_scores=T.nnet.softmax(span_scores_tensor.reshape((true_batch_size, 13*paragraph.shape[1]-78))) #(batch, 7*para_len-21)
    loss=-T.sum(T.log(span_scores[T.arange(true_batch_size), gold_indices]))
    
    test_span_scores_tensor=T.dot(test_span_reps.dimshuffle(0,2,1), norm_U_a)  #(batch, 7*para_len-21, 1)
    test_span_scores=T.nnet.softmax(test_span_scores_tensor.reshape((true_batch_size, 7*paragraph.shape[1]-21))) #(batch, 7*para_len-21)    
    test_return=T.argmax(test_span_scores, axis=1) #batch


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
#     L2_reg = L2norm_paraList([embeddings])
    cost=loss#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
   
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
   
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask, extraF, is_train], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF, is_train], test_return, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                                numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_label_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      1)

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
#                 writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8')
#                 writefile.write('{')
                pred_dict={}
#                 exact_match=0.0
#                 F1_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    batch_predict_ids=test_model(
                                        numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                              numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                              numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                              numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                              numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                              0)

#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size]
#                     para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size]
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
#                     print 'q_ids_batch:', q_ids_batch
                    # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    # para_len=len(test_para_wordlist_list[0])
                    # if para_len!=len(distribution_matrix[0]):
                    #     print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                    #     exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=test_batch_size
#                     print q_size
#                     print test_para_word_list

#                     Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size]
                    for q in range(test_batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_list[q])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
#                         writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ')
                        # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
#                         q_gold_ans_set=para_gold_ansset_list[q]
# #                         print test_para_wordlist_list[q]
# #                         print Q_list_inword[q]
# #                         print pred_ans.encode('utf8'), q_gold_ans_set
#                         if pred_ans in q_gold_ans_set:
#                             exact_match+=1
#                         F1=MacroF1(pred_ans, q_gold_ans_set)
#                         F1_match+=F1
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
#                 F1_acc=F1_match/q_amount
#                 exact_acc=exact_match/q_amount
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Paras_conv_'+str(max_exact_acc), params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
                
                
#                 os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt')




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
예제 #7
0
def evaluate_lenet5(learning_rate=0.08, n_epochs=2000, nkerns=[50], batch_size=1000, window_width=4,
                    maxSentLength=64, emb_size=50, hidden_size=50,
                    margin=0.5, L2_weight=0.0004, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=483142, comment='v5_margin0.6_neg300_'):
    maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/FB15k/'
    rng = numpy.random.RandomState(1234)
    triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,dev_triples, dev_triples_set, dev_entity_set, dev_relation_set, test_triples, test_triples_set, test_entity_set, test_relation_set=load_TrainDevTest_triples_RankingLoss(triple_path+'freebase_mtr100_mte100-train.txt',triple_path+'freebase_mtr100_mte100-valid.txt', triple_path+'freebase_mtr100_mte100-test.txt', line_no, triple_path)
    
    
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)
    dev_size=len(dev_triples)
    print 'dev triple size:', dev_size, 'entity_size:', len(dev_entity_set)
    test_size=len(test_triples)
    print 'test triple size:', test_size, 'entity_size:', len(test_entity_set)
#     print triples
#     print entity_count
#     print relation_count
#     exit(0)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
#     mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
#     wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')

    
#     entity_count=theano.shared(numpy.asarray(entity_count, dtype=theano.config.floatX), borrow=True)
#     entity_count=T.cast(entity_count, 'int64')
#     relation_count=theano.shared(numpy.asarray(relation_count, dtype=theano.config.floatX), borrow=True)
#     relation_count=T.cast(relation_count, 'int64')    


    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U, GRU_W, GRU_b=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) 
    
    para_to_load=[entity_E, relation_E, GRU_U, GRU_W, GRU_b]
    load_model_from_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), para_to_load)
    norm_entity_E=norm_matrix(entity_E)
    norm_relation_E=norm_matrix(relation_E)
    
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)
    batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
    batch_start=T.cast(batch_start, 'int64')   
    

    test_triple = T.lvector('test_triple')  
    neg_inds = T.lvector('neg_inds')

    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    predicted_tail=GRU_Combine_2Vector(norm_entity_E[test_triple[0]], norm_relation_E[test_triple[1]], emb_size, GRU_U, GRU_W, GRU_b)
    golden_tail=norm_entity_E[test_triple[2]]
    pos_loss=(1-cosine(predicted_tail,golden_tail))**2
    neg_Es=norm_entity_E[neg_inds].reshape((neg_inds.shape[0], emb_size))
    predicted_tail=predicted_tail.reshape((1, emb_size))
    multi=T.sum(predicted_tail*neg_Es, axis=1)
    len1=T.sqrt(T.sum(predicted_tail**2))
    len2=T.sqrt(T.sum(neg_Es**2, axis=1))
    cos=multi/(len1*len2)
    neg_loss_vector=(1-cos)**2

#     normed_predicted_tail=predicted_tail/T.sqrt(T.sum(predicted_tail**2))
#     
#     pos_loss=T.sum(abs(normed_predicted_tail-golden_tail))
#     neg_Es=norm_entity_E[neg_inds].reshape((neg_inds.shape[0], emb_size))
#     predicted_tail=normed_predicted_tail.reshape((1, emb_size))
# 
#     neg_loss_vector=T.sum(abs(predicted_tail-neg_Es), axis=1)
   
    
    
    
    GRU_forward_step = theano.function([test_triple, neg_inds], [pos_loss,neg_loss_vector], on_unused_input='ignore')
    
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0
    corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
#         cost_1, cost_l= train_model(triples)
#                 #print 'layer3_input', layer3_input
#         print 'cost:', cost_1, cost_l
        
        #test
        test_size=len(test_triples)
        hits_10=test_size
        hits_1=test_size
        
        co=0
        for test_triple in test_triples:
            co+=1

            count=0
            flag_continue=True
            nega_entity_set=get_negas(test_triple, corpus_triples_set, test_entity_set)
#             print len(nega_entity_set)
            p_loss, n_loss_vector=GRU_forward_step(test_triple, list(nega_entity_set))

            n_loss_vector=numpy.sort(n_loss_vector)
#             print p_loss
#             print n_loss_vector[:15]
#             exit(0)
            if p_loss>n_loss_vector[0]:
                hits_1-=1
            if p_loss>n_loss_vector[9]:
                hits_10-=1 
            if co%1000==0:
                print co, '...'
                print '\t\thits_10', hits_10*100.0/test_size, 'hits_1', hits_1*100.0/test_size
        hits_10=hits_10*100.0/test_size
        hits_1=hits_1*100.0/test_size
        
        
#             if patience <= iter:
#                 done_looping = True
#                 break
        #after each epoch, increase the batch_size
        store_model_to_file(triple_path+'Best_Paras_dim'+str(emb_size)+'_hits10_'+str(hits_10)[:6], para_to_load)
        print 'Finished storing best  params'
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min, Hits_10:',  hits_10, 'Hits_1:,', hits_1
        mid_time = time.clock()
        exit(0)
#         exit(0)
        
#         #store the paras after epoch 15
#         if epoch ==22:
#             store_model_to_file(params_conv)
#             print 'Finished storing best conv params'
#             exit(0)
            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, batch_size=1000, emb_size=50,
                    margin=0.6, L2_weight=1e-10, update_freq=1, norm_threshold=5.0, max_truncate=40, line_no=483142, neg_size=60, test_neg_size=300,
                    comment='v_reverse_'):#L1Distance_
    model_options = locals().copy()
    print "model options", model_options
    triple_path='/mounts/data/proj/wenpeng/Dataset/freebase/FB15k/'
    rng = numpy.random.RandomState(1234)
#     triples, entity_size, relation_size, entity_count, relation_count=load_triples(triple_path+'freebase_mtr100_mte100-train.txt', line_no, triple_path)#vocab_size contain train, dev and test
    triples, entity_size, relation_size, train_triples_set, train_entity_set, train_relation_set,dev_triples, dev_triples_set, dev_entity_set, dev_relation_set, test_triples, test_triples_set, test_entity_set, test_relation_set, statistics=load_TrainDevTest_triples_RankingLoss_EntityRelationNeighbors(triple_path+'freebase_mtr100_mte100-train.txt',triple_path+'freebase_mtr100_mte100-valid.txt', triple_path+'freebase_mtr100_mte100-test.txt', line_no, triple_path)
    train_h2t=statistics[0]
    train_t2h=statistics[1]
    train_r2t=statistics[2]
    train_r2h=statistics[3]
    train_r_replace_tail_prop=statistics[4]
    
    print 'triple size:', len(triples), 'entity_size:', entity_size, 'relation_size:', relation_size#, len(entity_count), len(relation_count)
    dev_size=len(dev_triples)
    print 'dev triple size:', dev_size, 'entity_size:', len(dev_entity_set)
    test_size=len(test_triples)
    print 'test triple size:', test_size, 'entity_size:', len(test_entity_set)
    
#     print triples
#     print entity_count
#     print relation_count
#     exit(0)
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
#     mtPath='/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
#     mt_train, mt_test=load_mts_wikiQA(mtPath+'result_train/concate_2mt_train.txt', mtPath+'result_test/concate_2mt_test.txt')
#     wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores.txt', rootPath+'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')


#     entity_count=theano.shared(numpy.asarray(entity_count, dtype=theano.config.floatX), borrow=True)
#     entity_count=T.cast(entity_count, 'int64')
#     relation_count=theano.shared(numpy.asarray(relation_count, dtype=theano.config.floatX), borrow=True)
#     relation_count=T.cast(relation_count, 'int64')    


    rand_values=random_value_normal((entity_size, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    entity_E=theano.shared(value=rand_values, borrow=True)      
    rand_values=random_value_normal((relation_size, emb_size), theano.config.floatX, numpy.random.RandomState(4321))
    relation_E=theano.shared(value=rand_values, borrow=True)    
    
    GRU_U0, GRU_W0, GRU_b0=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
    GRU_U1, GRU_W1, GRU_b1=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
    GRU_U2, GRU_W2, GRU_b2=create_GRU_para(rng, word_dim=emb_size, hidden_dim=emb_size)  
#     GRU_U_combine, GRU_W_combine, GRU_b_combine=create_nGRUs_para(rng, word_dim=emb_size, hidden_dim=emb_size, n=3) 
    para_to_load=[entity_E, relation_E, GRU_U0, GRU_W0, GRU_b0]
    load_model_from_file(triple_path+'Best_Paras_dim'+str(emb_size)+'_hits10_63.469', para_to_load)  #+'_hits10_63.350'
    GRU_U_combine=[GRU_U0, GRU_U1, GRU_U2]
    GRU_W_combine=[GRU_W0, GRU_W1, GRU_W2]
    GRU_b_combine=[GRU_b0, GRU_b1, GRU_b2]

    norm_entity_E=norm_matrix(entity_E)
    norm_relation_E=norm_matrix(relation_E)
    
        
    n_batchs=line_no/batch_size
    remain_triples=line_no%batch_size
    if remain_triples>0:
        batch_start=list(numpy.arange(n_batchs)*batch_size)+[line_no-batch_size]
    else:
        batch_start=list(numpy.arange(n_batchs)*batch_size)

    n_batchs_test=test_size/batch_size
    remain_triples_test=test_size%batch_size
    if remain_triples_test>0:
        batch_start_test=list(numpy.arange(n_batchs_test)*batch_size)+[test_size-batch_size]
    else:
        batch_start_test=list(numpy.arange(n_batchs_test)*batch_size)
#     batch_start=theano.shared(numpy.asarray(batch_start, dtype=theano.config.floatX), borrow=True)
#     batch_start=T.cast(batch_start, 'int64')   
    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')   # now, x is the index matrix, must be integer
    n_index_T = T.ltensor3('n_index_T')
#     x_index_r = T.imatrix('x_index_r')
#     y = T.ivector('y')  
#     left_l=T.iscalar()
#     right_l=T.iscalar()
#     left_r=T.iscalar()
#     right_r=T.iscalar()
#     length_l=T.iscalar()
#     length_r=T.iscalar()
#     norm_length_l=T.fscalar()
#     norm_length_r=T.fscalar()
#     mts=T.fmatrix()
#     wmf=T.fmatrix()
#     cost_tmp=T.fscalar()
#     #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
#     ishape = (emb_size, maxSentLength)  # this is the size of MNIST images
#     filter_size=(emb_size,window_width)
#     #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    
    dist_tail, dist_relation, dist_head=one_batch_parallel_Ramesh(x_index_l, norm_entity_E, norm_relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size)
    
    
    loss__tail_is, loss_relation_is, loss_head_is=one_neg_batches_parallel_Ramesh(n_index_T, norm_entity_E, norm_relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size)
    loss_tail_i=T.maximum(0.0, margin+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is) 
    loss_relation_i=T.maximum(0.0, margin+dist_relation.reshape((dist_relation.shape[0],1))-loss_relation_is)     
    loss_head_i=T.maximum(0.0, margin+dist_head.reshape((dist_head.shape[0],1))-loss_head_is)    
    
    
    loss_tail_i_test=T.maximum(0.0, 0.0+dist_tail.reshape((dist_tail.shape[0],1))-loss__tail_is)   
    binary_matrix_test=T.gt(loss_tail_i_test, 0)
    sum_vector_test=T.sum(binary_matrix_test, axis=1)
    binary_vector_hits10=T.gt(sum_vector_test, 10)
    test_loss=T.sum(binary_vector_hits10)*1.0/batch_size  
#     loss_relation_i=T.maximum(0.0, margin+dis_relation.reshape((dis_relation.shape[0],1))-loss__relation_is) 
#     loss_head_i=T.maximum(0.0, margin+dis_head.reshape((dis_head.shape[0],1))-loss__head_is)     
#     def neg_slice(neg_matrix):
#         dist_tail_slice, dis_relation_slice, dis_head_slice=one_batch_parallel_Ramesh(neg_matrix, entity_E, relation_E, GRU_U_combine, GRU_W_combine, GRU_b_combine, emb_size)
#         loss_tail_i=T.maximum(0.0, margin+dist_tail-dist_tail_slice) 
#         loss_relation_i=T.maximum(0.0, margin+dis_relation-dis_relation_slice) 
#         loss_head_i=T.maximum(0.0, margin+dis_head-dis_head_slice) 
#         return loss_tail_i, loss_relation_i, loss_head_i
#     
#     (loss__tail_is, loss__relation_is, loss__head_is), updates = theano.scan(
#        neg_slice,
#        sequences=n_index_T,
#        outputs_info=None)  
    
    loss_tails=T.mean(T.sum(loss_tail_i, axis=1) )
    loss_relations=T.mean(T.sum(loss_relation_i, axis=1) )
    loss_heads=T.mean(T.sum(loss_head_i, axis=1) )
    loss=loss_tails+loss_relations+loss_heads
    L2_loss=debug_print((entity_E** 2).sum()+(relation_E** 2).sum()\
                      +(GRU_U0** 2).sum()+(GRU_W0** 2).sum()+(GRU_U1** 2).sum()+(GRU_W1** 2).sum()+(GRU_U2** 2).sum()+(GRU_W2** 2).sum(), 'L2_reg')
#     Div_loss=Diversify_Reg(GRU_U[0])+Diversify_Reg(GRU_U[1])+Diversify_Reg(GRU_U[2])+\
#         Diversify_Reg(GRU_W[0])+Diversify_Reg(GRU_W[1])+Diversify_Reg(GRU_W[2])
    cost=loss+L2_weight*L2_loss#+div_reg*Div_loss
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [entity_E, relation_E, GRU_U0, GRU_W0, GRU_b0, GRU_U1, GRU_W1, GRU_b1, GRU_U2, GRU_W2, GRU_b2]
#     params_conv = [conv_W, conv_b]
    params_to_store=[entity_E, relation_E, GRU_U0, GRU_W0, GRU_b0, GRU_U1, GRU_W1, GRU_b1, GRU_U2, GRU_W2, GRU_b2]
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    grads = T.grad(cost, params)
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-9)))   #AdaGrad
        updates.append((acc_i, acc))    
  

#     grads = T.grad(cost, params)
#     updates = []
#     for param_i, grad_i in zip(params, grads):
#         updates.append((param_i, param_i - learning_rate * grad_i))   #AdaGrad 
        
    train_model = theano.function([x_index_l, n_index_T], [loss, cost], updates=updates,on_unused_input='ignore')
    test_model = theano.function([x_index_l, n_index_T], test_loss, on_unused_input='ignore')
# 
#     train_model_predict = theano.function([index], [cost_this,layer3.errors(y), layer3_input, y],
#           givens={
#             x_index_l: indices_train_l[index: index + batch_size],
#             x_index_r: indices_train_r[index: index + batch_size],
#             y: trainY[index: index + batch_size],
#             left_l: trainLeftPad_l[index],
#             right_l: trainRightPad_l[index],
#             left_r: trainLeftPad_r[index],
#             right_r: trainRightPad_r[index],
#             length_l: trainLengths_l[index],
#             length_r: trainLengths_r[index],
#             norm_length_l: normalized_train_length_l[index],
#             norm_length_r: normalized_train_length_r[index],
#             mts: mt_train[index: index + batch_size],
#             wmf: wm_train[index: index + batch_size]}, on_unused_input='ignore')



    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
#     validation_frequency = min(n_train_batches/5, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    svm_max=0.0
    best_epoch=0
    corpus_triples_set=train_triples_set|dev_triples_set|test_triples_set
    best_test_loss=1000000
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
#         learning_rate/=epoch
#         print 'lr:', learning_rate
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0
        #shuffle(train_batch_start)#shuffle training data
        for start in batch_start:
            if start%100000==0:
                print start, '...'
            pos_triples=triples[start:start+batch_size]
            all_negs=[]
#             count=0
            for pos_triple in pos_triples:
                neg_triples=get_n_neg_triples_train(pos_triple, train_triples_set, train_entity_set, train_r_replace_tail_prop, neg_size)
# #                 print 'neg_head_triples'
#                 neg_relation_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 1, neg_size/3)
# #                 print 'neg_relation_triples'
#                 neg_tail_triples=get_n_neg_triples(pos_triple, train_triples_set, train_entity_set, train_relation_set, 2, neg_size/3)
#                 print 'neg_tail_triples'
                all_negs.append(neg_triples)
#                 print 'neg..', count
#                 count+=1
            neg_tensor=numpy.asarray(all_negs).reshape((batch_size, neg_size, 3)).transpose(1,0,2)
            loss, cost= train_model(pos_triples, neg_tensor)
        print 'Training loss:', loss, 'cost:', cost
        
        
        loss_test=0.0

        for test_start in batch_start_test:
            pos_triples=test_triples[test_start:test_start+batch_size]
            all_negs=[]
            for pos_triple in pos_triples:
                neg_triples=get_n_neg_triples_new(pos_triple, corpus_triples_set, test_entity_set, test_relation_set, test_neg_size/2, True)
                all_negs.append(neg_triples)
                
            neg_tensor=numpy.asarray(all_negs).reshape((batch_size, test_neg_size, 3)).transpose(1,0,2)
            loss_test+= test_model(pos_triples, neg_tensor)
            
            
        loss_test/=n_batchs_test
        print '\t\t\tUpdating epoch', epoch, 'finished! Test hits10:', 1.0-loss_test
        if loss_test< best_test_loss:
            store_model_to_file(triple_path+comment+'Best_Paras_dim'+str(emb_size), params_to_store)
#             store_model_to_file(triple_path+'Divreg_Best_Paras_dim'+str(emb_size), params_to_store)
            best_test_loss=loss_test
            print 'Finished storing best  params'
#             exit(0)
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock()            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
예제 #9
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}
    root = '/save/wenpeng/datasets/FEVER/'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    vocabfile = codecs.open(root + 'word2id.txt', 'r', 'utf-8')
    word2id = json.loads(vocabfile.read())
    # co=0
    # for line in vocabfile:
    #     word2id = json.loads(line)
    #     co+=1
    # print 'co: ', co
    # word2id = json.load(open(root+'word2id.json')) #json.loads(vocabfile)
    vocabfile.close()
    print 'load word2id over'
    # train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id  = load_fever_train(sent_len, claim_len, cand_size)
    # train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(sent_len, claim_len, cand_size, word2id)
    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, _ = load_SciTailV1_dataset(
        sent_len, word2id)
    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_labels, _ = load_RTE_dataset_as_test(sent_len, word2id)

    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l = np.asarray(all_sentences_l[2], dtype='int32')

    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l = np.asarray(all_masks_l[2], dtype=theano.config.floatX)

    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_sents_r = np.asarray(all_sentences_r[2], dtype='int32')

    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r = np.asarray(all_masks_r[2], dtype=theano.config.floatX)

    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    test_labels_store = np.asarray(all_labels[2], dtype='int32')

    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)

    vocab_size = len(word2id) + 1

    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    # id2word = {y:x for x,y in word2id.iteritems()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.imatrix()  #(batch, cand_size, sent_len)
    sents_mask = T.fmatrix()
    # sents_labels=T.imatrix() #(batch, cand_size)
    # claim_ids = T.imatrix() #(batch, claim_len)
    # claim_mask = T.fmatrix()

    # joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    # joint_sents_mask=T.ftensor3()
    # joint_sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()
    labels = T.ivector()

    # test_premise_ids = T.imatrix()
    # test_premise_matrix = T.fmatrix()
    # test_hypo_ids = T.imatrix()
    # test_hypo_matrix = T.fmatrix()
    # test_scitail_minibatch_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, sent_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    # task1_att_conv_W, task1_att_conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    # task1_conv_W_context, task1_conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [
        conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context, conv_b_context
    ]
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    joint_premise_emb = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    # joint_batch_sent_emb = joint_sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))
    # joint_premise_emb = T.sum(joint_batch_sent_emb*joint_sents_labels.dimshuffle(0,1,'x'), axis=1) #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  #mutiple mask with the conv_out to set the features by UNK to zero
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    # joint_sents_tensor3 = joint_embed_input_sents.dimshuffle(0,2,1).reshape((batch_size, cand_size*sent_len, emb_size))
    # joint_sents_dot = T.batched_dot(joint_sents_tensor3, joint_sents_tensor3.dimshuffle(0,2,1)) #(batch_size, cand_size*sent_len, cand_size*sent_len)
    # joint_sents_dot_2_matrix = T.nnet.softmax(joint_sents_dot.reshape((batch_size*cand_size*sent_len, cand_size*sent_len)))
    # joint_sents_context = T.batched_dot(joint_sents_dot_2_matrix.reshape((batch_size, cand_size*sent_len, cand_size*sent_len)), joint_sents_tensor3) #(batch_size, cand_size*sent_len, emb_size)
    # joint_add_sents_context = joint_embed_input_sents+joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#T.concatenate([joint_embed_input_sents, joint_sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        joint_embed_input_sents,  #batch_size*cand_size, 2*emb_size, sent_len
        input_tensor3_r=joint_embed_input_claim,
        mask_matrix=sents_mask,
        mask_matrix_r=claim_mask,
        image_shape=(batch_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([
        joint_premise_hypo_emb, attentive_sent_embeddings_l,
        attentive_sent_embeddings_r
    ],
                                   axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    # joint_loss=joint_layer_LR.negative_log_likelihood(joint_labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    '''
    testing
    joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    joint_sents_mask=T.ftensor3()
    joint_sents_labels=T.imatrix() #(batch, cand_size)
    joint_claim_ids = T.imatrix() #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels=T.ivector()
    '''
    pred_minibatch_labels = joint_layer_LR.y_pred
    pred_minibatch_labels_2_2classes = T.where(pred_minibatch_labels > 1, 0,
                                               pred_minibatch_labels)

    pred_minibatch_error = T.mean(
        T.neq(pred_minibatch_labels_2_2classes, labels))

    params = [init_embeddings] + NN_para + joint_LR_para
    load_model_from_file(root + 'para_for_test_scitail', params)

    # train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask,joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(
        [sents_ids, sents_mask, claim_ids, claim_mask, labels],
        pred_minibatch_error,
        allow_input_downcast=True,
        on_unused_input='ignore')
    # dev_model = theano.function([joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], pred_minibatch_error, allow_input_downcast=True, on_unused_input='ignore')

    # test_model = theano.function([sents_ids,sents_mask,sents_labels, claim_ids,claim_mask, joint_labels], [inter_matrix,test_layer_LR.errors(joint_labels), test_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_ids,sents_mask,sents_labels, claim_ids,claim_mask, joint_labels], [inter_matrix,test_layer_LR.errors(joint_labels), test_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... testing'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    # joint_n_train_batches=joint_train_size/batch_size
    # joint_train_batch_start=list(np.arange(joint_n_train_batches)*batch_size)+[joint_train_size-batch_size]
    # n_train_batches=train_size/batch_size
    # train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]

    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    remain_test_batches = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_acc_dev = 0.0
    max_acc_test = 0.0

    cost_i = 0.0

    error_sum = 0.0
    for idd, test_batch_id in enumerate(
            test_batch_start):  # for each test batch
        error_i = test_model(
            test_sents_l[test_batch_id:test_batch_id + batch_size],
            test_masks_l[test_batch_id:test_batch_id + batch_size],
            test_sents_r[test_batch_id:test_batch_id + batch_size],
            test_masks_r[test_batch_id:test_batch_id + batch_size],
            test_labels_store[test_batch_id:test_batch_id + batch_size])
        error_sum += error_i
    test_acc = 1.0 - error_sum / (len(test_batch_start))
    print '\tcurrent test_acc:', test_acc
def evaluate_lenet5(learning_rate=0.001, n_epochs=2000, batch_size=500, test_batch_size=1000, emb_size=300, hidden_size=300, HL_hidden_size=200,
                    L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=1000,
                    para_len=60, question_len=20, c_len=7, e_len=2):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/work/hs/yin/20161219/';
    storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)
    
    word2id={}
    word2id['UNK']=0 # use it to pad 
    word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks,train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewtrn.txt')
    word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewdev.txt')

    print 'word2id size for bigger dataset:', len(word2id)
#     word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len, word2id, rootPath+'squadnewtrn,subset.txt')
#     word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, word2id, rootPath+'squadnewdev,subset.txt')
#     
    print 'word2id size for smaller dataset:', len(word2id)
#     if len(train_questions)!=train_size or len(test_questions)!=test_size:
#         print 'len(questions)!=train_size or len(test_questions)!=test_size:', len(train_questions),train_size,len(test_questions),test_size
#         exit(0)
    train_size=len(train_questions)
    test_size = len(test_questions)
    
    train_questions = np.asarray(train_questions, dtype='int32')
    
#     print train_questions[:10,:]
#     exit(0)
    train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX)
    train_paras = np.asarray(train_paras, dtype='int32')
    train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX)

    train_e_ids = np.asarray(train_e_ids, dtype='int32')
    train_e_masks = np.asarray(train_e_masks, dtype=theano.config.floatX)
    train_c_ids = np.asarray(train_c_ids, dtype='int32')
    train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX)

    train_c_heads = np.asarray(train_c_heads, dtype='int32')
    train_c_tails = np.asarray(train_c_tails, dtype='int32')
    train_l_heads = np.asarray(train_l_heads, dtype='int32')
    train_l_tails = np.asarray(train_l_tails, dtype='int32')
    train_e_heads = np.asarray(train_e_heads, dtype='int32')
    train_e_tails = np.asarray(train_e_tails, dtype='int32')
    train_labels = np.asarray(train_labels, dtype='int32')
    train_labels_3c = np.asarray(train_labels_3c, dtype='int32')

    test_questions = np.asarray(test_questions, dtype='int32')
    test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX)
    test_paras = np.asarray(test_paras, dtype='int32')
    test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX)

    test_e_ids = np.asarray(test_e_ids, dtype='int32')
    test_e_masks = np.asarray(test_e_masks, dtype=theano.config.floatX)
    test_c_ids = np.asarray(test_c_ids, dtype='int32')
    test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX)

    test_c_heads = np.asarray(test_c_heads, dtype='int32')
    test_c_tails = np.asarray(test_c_tails, dtype='int32')
    test_l_heads = np.asarray(test_l_heads, dtype='int32')
    test_l_tails = np.asarray(test_l_tails, dtype='int32')
    test_e_heads = np.asarray(test_e_heads, dtype='int32')
    test_e_tails = np.asarray(test_e_tails, dtype='int32')
    test_labels = np.asarray(test_labels, dtype='int32')

    overall_vocab_size=len(word2id)
    print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, rng)
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()

    para=T.imatrix()  #(2*batch, len)
    para_mask=T.fmatrix() #(2*batch, len)

    c_ids=T.imatrix()  #(2*batch, len)
    c_mask=T.fmatrix() #(2*batch, len)
    e_ids=T.imatrix()  #(2*batch, len)
    e_mask=T.fmatrix() #(2*batch, len)

    c_heads=T.ivector() #batch
    c_tails=T.ivector() #batch
    l_heads=T.ivector() #batch
    l_tails=T.ivector() #batch
    e_heads=T.ivector() #batch
    e_tails=T.ivector() #batch
    q=T.imatrix()  #(2*batch, len_q)
    q_mask=T.fmatrix() #(2*batch, len_q)
    labels=T.ivector() #batch





    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size = para.shape[0]

#     U_p, W_p, b_p=create_GRU_para(rng, emb_size, hidden_size)
#     U_p_b, W_p_b, b_p_b=create_GRU_para(rng, emb_size, hidden_size)
#     GRU_p_para=[U_p, W_p, b_p, U_p_b, W_p_b, b_p_b]
#     
#     U_q, W_q, b_q=create_GRU_para(rng, emb_size, hidden_size)
#     U_q_b, W_q_b, b_q_b=create_GRU_para(rng, emb_size, hidden_size)
#     GRU_q_para=[U_q, W_q, b_q, U_q_b, W_q_b, b_q_b]
    
    paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len)
    q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len)


    fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen)

#     paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U_p,W=W_p,b=b_p,Ub=U_p_b,Wb=W_p_b,bb=b_p_b)
#     paragraph_reps_tensor3=paragraph_model.output_tensor_conc #(batch, 2*hidden, para_len)


    fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden)

#     q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U_q,W=W_q,b=b_q,Ub=U_q_b,Wb=W_q_b,bb=b_q_b)
#     q_reps=q_model.output_sent_rep_conc #(batch, 2*hidden)

    #interaction
    batch_ids=T.arange(true_batch_size)
    c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden)
    c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden)
    candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden)

    l_heads_reps=paragraph_reps_tensor3[batch_ids,:,l_heads] #(batch, 2*hidden)
    l_tails_reps=paragraph_reps_tensor3[batch_ids,:,l_tails] #(batch, 2*hidden)
    longs_reps=T.concatenate([l_heads_reps, l_tails_reps], axis=1) #(batch, 4*hidden)

    e_heads_reps=paragraph_reps_tensor3[batch_ids,:,e_heads] #(batch, 2*hidden)
    e_tails_reps=paragraph_reps_tensor3[batch_ids,:,e_tails] #(batch, 2*hidden)
    extensions_reps=T.concatenate([e_heads_reps, e_tails_reps], axis=1) #(batch, 4*hidden)
    

    #glove level average
    c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    c_sum = T.sum(c_input*c_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size)
    average_C_batch = c_sum/T.sqrt(T.sum(c_sum**2, axis=1)+1e-20).dimshuffle(0,'x')

    e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    e_sum = T.sum(e_input*e_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size)
    average_E_batch = e_sum/T.sqrt(T.sum(e_sum**2, axis=1)+1e-20).dimshuffle(0,'x')    

#     e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size)
    average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x')    
    #classify


    HL_layer_1_input_size=14*hidden_size+3*emb_size
    
    HL_layer_1_input = T.concatenate([q_reps, longs_reps, extensions_reps, candididates_reps, average_E_batch, average_C_batch, average_Q_batch], axis=1) #(batch, 14*hidden)
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh)
    



    
    LR_input=HL_layer_2.output #T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1) #(batch, 10*hidden)
    LR_input_size= HL_hidden_size#HL_layer_1_input_size+2*HL_hidden_size
    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    norm_U_a=normalize_matrix(U_a)
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class  
    LR_para=[U_a, LR_b]
    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    






    params = LR_para+[embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params
    load_model_from_file(storePath+'Best_Paras_HS_v2_000_withSumNorm_0.706428571429', params)
    
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+0.0005*T.mean(U_a**2)


    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20)))   #AdaGrad
        updates.append((acc_i, acc))


    train_model = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], cost, updates=updates,on_unused_input='ignore')

    train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore')


    test_model = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], [layer_LR.errors(labels),layer_LR.y_pred, layer_LR.prop_for_posi], on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size    #batch_size means how many pairs
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 

    n_train_batches_pred=train_size/batch_size_pred    #batch_size means how many pairs
    train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] 

    n_test_batches=test_size/test_batch_size    #batch_size means how many pairs
    n_test_batches_remain=test_size%test_batch_size
    test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-n_test_batches_remain]




    max_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    train_ids_pred = range(train_size)
    best_test_statistic=defaultdict(int)
#     best_train_statistic=defaultdict(int)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
#         print train_ids[:100]
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1

            iter_accu+=1
            train_id_list = train_ids[para_id:para_id+batch_size]
#             print 'train_labels[train_id_list]:', train_labels[train_id_list]
#             cost_i+= train_model(
#                                 train_paras[train_id_list],
#                                 train_paras_mask[train_id_list],
#                                 train_c_heads[train_id_list],
#                                 train_c_tails[train_id_list],
#                                 train_l_heads[train_id_list],
#                                 train_l_tails[train_id_list],
#                                 train_e_heads[train_id_list],
#                                 train_e_tails[train_id_list],
#                                 train_questions[train_id_list],
#                                 train_questions_mask[train_id_list],
#                                 train_labels[train_id_list])

            #print iter
            if  iter%1==0: #iter>=200 and
                print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'

                past_time = time.time()
#                 print 'Training Pred...'
#                 train_statistic=defaultdict(int)
#                 for para_id in train_batch_start_pred:
#                     train_id_list = train_ids_pred[para_id:para_id+batch_size_pred]
#                     gold_train_labels_list = train_labels_3c[train_id_list]
# #                     print 'train_id_list:', train_id_list
# #                     print 'train_c_heads[train_id_list]:', train_c_heads[train_id_list]
#                     train_preds_i= train_model_pred(
#                                         train_paras[train_id_list],
#                                         train_paras_mask[train_id_list],
#                                         train_c_heads[train_id_list],
#                                         train_c_tails[train_id_list],
#                                         train_l_heads[train_id_list],
#                                         train_l_tails[train_id_list],
#                                         train_e_heads[train_id_list],
#                                         train_e_tails[train_id_list],
#                                         train_questions[train_id_list],
#                                         train_questions_mask[train_id_list],
#                                         train_labels[train_id_list])  
# 
#                     for ind, gold_label in enumerate(gold_train_labels_list):
#                         train_statistic[(gold_label, train_preds_i[ind])]+=1   
#                     train_acc= (train_statistic.get((1,1),0)+train_statistic.get((0,0),0))*1.0/(train_statistic.get((1,1),0)+train_statistic.get((0,0),0)+train_statistic.get((1,0),0)+train_statistic.get((0,1),0))
#                             
#                 print '\t\tcurrnt train acc:', train_acc, ' train_statistic:', train_statistic
                print 'Testing...'
                error=0
                test_statistic=defaultdict(int)
                writefile=open(storePath+'prob_for_GOOD_000_sumnorm.txt', 'w')
                for test_para_id in test_batch_start:
#                     test_id_list = range(test_para_id, test_para_id+test_batch_size)   
#                     print 'test_id_list:',test_id_list    
#                     print 'test_c_heads[test_id_list]', test_c_heads[test_id_list]
                    gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size]
                    error_i, preds_i, pos_prob_i= test_model(
                                        test_paras[test_para_id:test_para_id+test_batch_size],
                                        test_paras_mask[test_para_id:test_para_id+test_batch_size],
                                        test_c_ids[test_para_id:test_para_id+test_batch_size],
                                        test_c_masks[test_para_id:test_para_id+test_batch_size],
                                        test_e_ids[test_para_id:test_para_id+test_batch_size],
                                        test_e_masks[test_para_id:test_para_id+test_batch_size],
                                        test_c_heads[test_para_id:test_para_id+test_batch_size],
                                        test_c_tails[test_para_id:test_para_id+test_batch_size],
                                        test_l_heads[test_para_id:test_para_id+test_batch_size],
                                        test_l_tails[test_para_id:test_para_id+test_batch_size],
                                        test_e_heads[test_para_id:test_para_id+test_batch_size],
                                        test_e_tails[test_para_id:test_para_id+test_batch_size],
                                        test_questions[test_para_id:test_para_id+test_batch_size],
                                        test_questions_mask[test_para_id:test_para_id+test_batch_size],
                                        test_labels[test_para_id:test_para_id+test_batch_size])
                    writefile.write('\n'.join(map(str,pos_prob_i))+'\n')

                    error+=error_i
                    for ind, gold_label in enumerate(gold_labels_list):
                        test_statistic[(gold_label, preds_i[ind])]+=1
#                 acc=1.0-error*1.0/len(test_batch_start)
                print 'prob_for_GOOD.txt written over.'
                writefile.close()
                acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0))
                
                if acc> max_acc:
                    max_acc=acc
                    best_test_statistic=test_statistic
#                     store_model_to_file(storePath+'Best_Paras_HS_v2_'+str(max_acc), params)
#                     print 'Finished storing best  params at:', max_acc
                print 'current average acc:', acc, '\t\tmax acc:', max_acc, '\ttest_statistic:', test_statistic
                print '\t\t\t\tbest statistic:', best_test_statistic
                exit(0)



            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
예제 #11
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=10,
                    hidden_size=10,
                    L2_weight=0.0001,
                    para_len_limit=400,
                    q_len_limit=40,
                    max_EM=0.217545454546):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #     id2word = {y:x for x,y in overall_word2id.iteritems()}
    #     word2vec=load_word2vec()
    #     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    norm_extraF = normalize_matrix(extraF)

    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]

    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features
    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras

    load_model_from_file(rootPath + 'Best_Paras_conv_0.217545454545', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    concate_paragraph_input = T.concatenate(
        [paragraph_input, norm_extraF.dimshuffle((0, 2, 1))], axis=1)

    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    #     #LSTM
    #     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    #     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    #     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)

    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    #     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    questions_reps_tensor = questions_model.output_tensor

    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    #     #LSTM for questions
    #     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    #     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    #     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    #     questions_reps_tensor=questions_model.output_tensor

    #use CNN for question modeling
    #     Qs_emb_tensor4=Qs_emb.dimshuffle((0,'x', 1,2)) #(batch_size, 1, emb+3, maxparalen)
    #     conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, 5))
    #     Q_conv_para=[conv_W, conv_b]
    #     conv_model = Conv_with_input_para(rng, input=Qs_emb_tensor4,
    #             image_shape=(batch_size, 1, emb_size, q_len_limit),
    #             filter_shape=(hidden_size, 1, emb_size, 5), W=conv_W, b=conv_b)
    #     conv_output=conv_model.narrow_conv_out.reshape((batch_size, hidden_size, q_len_limit-5+1)) #(batch, 1, hidden_size, maxparalen-1)
    #     gru_mask=(q_mask[:,:-4]*q_mask[:,1:-3]*q_mask[:,2:-2]*q_mask[:,3:-1]*q_mask[:,4:]).reshape((batch_size, 1, q_len_limit-5+1))
    #     masked_conv_output=conv_output*gru_mask
    #     questions_conv_reps=T.max(masked_conv_output, axis=2).reshape((batch_size, 1, hidden_size))

    #     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
    #     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
    #     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
    #     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
    #     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
    #     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
    #     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)

    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix = para_matrix.T
        interaction_matrix = T.dot(transpose_para_matrix,
                                   q_matrix)  #(para_len, q_len)
        norm_interaction_matrix = T.nnet.softmax(interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)  #(len, para_len)

    batch_q_reps, updates = theano.scan(
        fn=example_in_batch,
        outputs_info=None,
        sequences=[para_reps, questions_reps_tensor
                   ])  #batch_q_reps (batch, hidden, para_len)

    #attention distributions

    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    transformed_para_reps = T.maximum(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2), 0.0)  #relu
    transformed_q_reps = T.maximum(
        T.dot(batch_q_reps.transpose((0, 2, 1)), norm_W_a1), 0.0)
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = transformed_para_reps + transformed_q_reps

    #     U_c, W_c, b_c=create_GRU_para(rng, hidden_size, hidden_size)
    #     U_c_b, W_c_b, b_c_b=create_GRU_para(rng, hidden_size, hidden_size)
    #     accumu_para=[U_c, W_c, b_c, U_c_b, W_c_b, b_c_b]
    #     accumu_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_both.transpose((0,2,1)), Mask=para_mask, hidden_dim=hidden_size,U=U_c,W=W_c,b=b_c,Ub=U_c_b,Wb=W_c_b,bb=b_c_b)
    #     accu_both=accumu_model.output_tensor.transpose((0,2,1))

    prior_att = T.concatenate([add_both, norm_extraF], axis=2)

    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices = para_mask.flatten().nonzero()[0]

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.sum(
        T.log(layer3.p_y_given_x)
        [valid_indices,
         labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    #     masked_dis=(distributions+ConvGRU_1_dis_into_unigram)*para_mask
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()
    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    #     def AttentionLayer(q_rep, ext_M):
    #         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    #         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    #        f __name__ == '__main__':
    #         prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    #         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    #         return strength.transpose() #(1, #words)

    #     distributions, updates = theano.scan(
    #     AttentionLayer,
    #     sequences=[questions_reps,extraF] )

    #     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    #     labels=debug_print(labels, 'labels')
    #     label_mask=T.gt(labels,0.0)
    #     neg_label_mask=T.lt(labels,0.0)
    #     dis_masked=distributions*label_mask
    #     remain_dis_masked=distributions*neg_label_mask
    #     pos_error=((dis_masked-1)**2).mean()
    #     neg_error=((remain_dis_masked-(-1))**2).mean()
    #     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+ConvGRU_1.error#

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            #             haha=para_mask[para_id:para_id+batch_size]
            #             print haha
            #             for i in range(batch_size):
            #                 print len(haha[i])
            cost_i += train_model(
                np.asarray([
                    train_para_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_Q_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_label_list[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype='int32'),
                np.asarray([
                    train_para_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_mask[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX),
                np.asarray([
                    train_feature_matrixlist[id]
                    for id in train_ids[para_id:para_id + batch_size]
                ],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                F1_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    #                     print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount += batch_size
                    #                     print q_size
                    #                     print test_para_word_list

                    Q_list_inword = test_Q_list_word[
                        test_para_id:test_para_id + batch_size]
                    for q in range(batch_size):  #for each question
                        #                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        #                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        #                         else:
                        #                             ss=len(distribution_matrix[q])
                        #                             combine_list=[]
                        #                             for ii in range(ss):
                        #                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        #                             print combine_list
                        #                         exit(0)
                        #                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q], distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q], Q_list_inword[q])
                        q_gold_ans_set = para_gold_ansset_list[q]
                        #                         print test_para_wordlist_list[q]
                        #                         print Q_list_inword[q]
                        #                         print pred_ans.encode('utf8'), q_gold_ans_set
                        if pred_ans in q_gold_ans_set:
                            exact_match += 1
                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        F1_match += F1


#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc = F1_match / q_amount
                exact_acc = exact_match / q_amount
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(
                            rootPath + 'Best_Paras_conv_' + str(max_exact_acc),
                            params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))