def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300,
                    L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5],
                    char_filter_size=4, margin=0.5, max_EM=50.302743615):
    test_batch_size=batch_size*10
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)


    word2id={}
    char2id={}
    #questions,paragraphs,q_masks,p_masks,labels, word2id
    train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size=len(train_para_list)

    test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, test_label_list, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size=len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX)



    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng)
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices= T.ivector() #batch, one gold word for each sample
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')

    char_paragraph = T.imatrix() #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask=T.fmatrix()
    char_q_mask=T.fmatrix()

#     true_p_len = T.iscalar()



    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]
    true_p_len = paragraph.shape[1]

    common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size))


    char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len))
    char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len))


    char_LSTM_para_dict=create_LSTM_para(rng, char_emb_size, char_emb_size)
    char_LSTM_para_dict_bw=create_LSTM_para(rng, char_emb_size, char_emb_size)
    char_lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_p.dimshuffle(0,2,1), char_p_masks,  char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw)
    char_word_embeddings_p = char_lstm_layer_p.output_sent_rep_conc.reshape((true_batch_size, true_p_len, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden)

    char_lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(char_common_input_q.dimshuffle(0,2,1), char_q_masks,  char_emb_size, char_LSTM_para_dict,char_LSTM_para_dict_bw)
    char_word_embeddings_q = char_lstm_layer_q.output_sent_rep_conc.reshape((true_batch_size, q_len_limit, 2*char_emb_size)).dimshuffle(0, 2,1) #(batch, 2*hidden)


    LSTM_para_dict=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size)  #40+300
    LSTM_para_dict_bw=create_LSTM_para(rng, 2*char_emb_size+emb_size,hidden_size)

    p_input2lstm = T.concatenate([common_input_p.dimshuffle(0,2,1), char_word_embeddings_p], axis=1) #(batch, emb_size+char_emb_size, p_len)
    q_input2lstm = T.concatenate([common_input_q.dimshuffle(0,2,1), char_word_embeddings_q], axis=1) #(batch, emb_size+char_emb_size, p_len)

    lstm_layer_p=Bd_LSTM_Batch_Tensor_Input_with_Mask(p_input2lstm, para_mask,  hidden_size, LSTM_para_dict,LSTM_para_dict_bw)
    p_tensor3 = lstm_layer_p.output_tensor_conc #(batch, 2*hidden, p_len)

    lstm_layer_q=Bd_LSTM_Batch_Tensor_Input_with_Mask(q_input2lstm, q_mask,  hidden_size, LSTM_para_dict,LSTM_para_dict_bw)
    q_reps = lstm_layer_q.output_sent_rep_conc #(batch, 2*hidden)


    NN_para=char_LSTM_para_dict.values()+char_LSTM_para_dict_bw.values()+LSTM_para_dict.values()+LSTM_para_dict_bw.values()


    input4score = T.concatenate([p_tensor3, T.repeat(q_reps.dimshuffle(0,1,'x'), true_p_len, axis=2)], axis=1) #(batch, 4*hidden, p_len)




    HL_1_para = create_ensemble_para(rng, hidden_size, 4*hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a=normalize_matrix(U_a)
    norm_HL_1_para=normalize_matrix(HL_1_para)
    norm_HL_2_para=normalize_matrix(HL_2_para)
    norm_HL_3_para=normalize_matrix(HL_3_para)
    norm_HL_4_para=normalize_matrix(HL_4_para)


    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, true_batch_size,true_p_len)
    span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len)
    loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(true_batch_size), gold_indices]))

    #ranking loss
    tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size)

    index_matrix = T.zeros((true_batch_size, p_len_limit), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(index_matrix[T.arange(true_batch_size), gold_indices], 1.0)


    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()]

    repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    loss = loss_neg_likelihood + loss_rank

    #test
    mask_test_return=T.argmax(span_scores_matrix*para_mask, axis=1) #batch



    params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]

#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask,    char_paragraph, #(batch, char_len*p_len)
        char_questions, char_para_mask, char_q_mask], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask,
        char_paragraph,
        char_questions,
        char_para_mask,
        char_q_mask], mask_test_return, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_ids[para_id:para_id+batch_size]
            cost_i+= train_model(
                                 train_para_list[train_id_batch],
                                 train_Q_list[train_id_batch],
                                 train_label_list[train_id_batch],
                                 train_para_mask[train_id_batch],
                                 train_Q_mask[train_id_batch],
                                 train_para_char_list[train_id_batch],
                                 train_Q_char_list[train_id_batch],
                                 train_para_char_mask[train_id_batch],
                                 train_Q_char_mask[train_id_batch])


            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                # pred_dict={}
                q_amount=0
                p1=0
                for test_para_id in test_batch_start:
                    batch_predict_ids=test_model(
                                                 test_para_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_para_char_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_char_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_char_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_char_mask[test_para_id:test_para_id+test_batch_size])
                    # test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size]
                    test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
                    q_amount+=test_batch_size

                    for q in range(test_batch_size): #for each question
                        predict_id = batch_predict_ids[q]
                        ground_ids=test_label_batch[q]
                        if predict_id in set(ground_ids):
                            p1+=1
#                         print batch_predict_ids[q], mask_batch_predict_ids[q], test_p_len_limit - numpy.sum(test_para_mask[test_para_id+q]), scores_i[q], test_para_mask[test_para_id+q]
                exact_acc = p1*100.0/q_amount
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print '\t\tcurrent  exact:', exact_acc, '\t\t\tmax exact_acc:', max_exact_acc






            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #2
0
def evaluate_lenet5(learning_rate=0.5, n_epochs=2000, batch_size=500, emb_size=300, hidden_size=300,
                    L2_weight=0.0001, para_len_limit=700, q_len_limit=40):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_label_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist= load_dev_or_test(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    id2word = {y:x for x,y in overall_word2id.iteritems()}
    word2vec=load_word2vec()
    


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
#     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')   
    questions = T.imatrix('questions')  
    labels = T.imatrix('labels')
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)


    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    paragraph_input = embeddings[paragraph.flatten()].reshape((paragraph.shape[0], paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
#     
# #     BdGRU(rng, str(0), shape, X, mask, is_train = 1, batch_size = 1, p = 0.5)
#     
    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b] 
    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)




 
    Qs_emb = embeddings[questions.flatten()].reshape((questions.shape[0], questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)
    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b] 
    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    questions_reps=questions_model.output_sent_rep_maxpooling.reshape((batch_size, 1, hidden_size)) #(batch, 2*out_size)
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)
    
    
    #attention distributions
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size+3) # 3 extra features
    
    norm_W_a1=normalize_matrix(W_a1)
    norm_W_a2=normalize_matrix(W_a2)
    norm_U_a=normalize_matrix(U_a)

    LR_b = theano.shared(value=numpy.zeros((2,),
                                                 dtype=theano.config.floatX),  # @UndefinedVariable
                               name='LR_b', borrow=True)
     
    attention_paras=[W_a1, W_a2, U_a, LR_b]
    
    transformed_para_reps=T.tanh(T.dot(para_reps.transpose((0, 2,1)), norm_W_a2))
    transformed_q_reps=T.tanh(T.dot(questions_reps, norm_W_a1))
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)    
    
    add_both=0.5*(transformed_para_reps+transformed_q_reps)
    prior_att=T.concatenate([add_both, normalize_matrix(extraF)], axis=2)
    
    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices=para_mask.flatten().nonzero()[0]
    
    layer3=LogisticRegression(rng, input=prior_att.reshape((batch_size*prior_att.shape[1], hidden_size+3)), n_in=hidden_size+3, n_out=2, W=norm_U_a, b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.mean(T.log(layer3.p_y_given_x)[valid_indices, labels.flatten()[valid_indices]])#[T.arange(y.shape[0]), y])

    distributions=layer3.p_y_given_x[:,-1].reshape((batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    masked_dis=distributions*para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()


    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''
   
#     def AttentionLayer(q_rep, ext_M):
#         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
#         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
#        f __name__ == '__main__': 
#         prior_att=T.concatenate([prior_att, ext_M], axis=1)
#                               
#         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
#         return strength.transpose() #(1, #words)
 
#     distributions, updates = theano.scan(
#     AttentionLayer,
#     sequences=[questions_reps,extraF] )
    
#     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
#     labels=debug_print(labels, 'labels')
#     label_mask=T.gt(labels,0.0)
#     neg_label_mask=T.lt(labels,0.0)
#     dis_masked=distributions*label_mask
#     remain_dis_masked=distributions*neg_label_mask
#     pos_error=((dis_masked-1)**2).mean()
#     neg_error=((remain_dis_masked-(-1))**2).mean()
#     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error
    


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [embeddings]+paragraph_para+Q_para+attention_paras
    L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=error#+L2_weight*L2_reg
    
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
        
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
  
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))    



    train_model = theano.function([paragraph, questions,labels, para_mask, q_mask, extraF], error, updates=updates,on_unused_input='ignore')
    
    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], masked_dis, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False
    

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_exact_acc=0.0
    cost_i=0.0
    
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #shuffle(train_batch_start)
        iter_accu=0
        for para_id in train_batch_start: 
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                                np.asarray(train_para_list[para_id:para_id+batch_size], dtype='int32'), 
                                      np.asarray(train_Q_list[para_id:para_id+batch_size], dtype='int32'), 
                                      np.asarray(train_label_list[para_id:para_id+batch_size], dtype='int32'), 
                                      np.asarray(train_para_mask[para_id:para_id+batch_size], dtype=theano.config.floatX),
                                      np.asarray(train_mask[para_id:para_id+batch_size], dtype=theano.config.floatX),
                                      np.asarray(train_feature_matrixlist[para_id:para_id+batch_size], dtype=theano.config.floatX))

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                  
                exact_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    distribution_matrix=test_model(
                                        np.asarray(test_para_list[test_para_id:test_para_id+batch_size], dtype='int32'), 
                                              np.asarray(test_Q_list[test_para_id:test_para_id+batch_size], dtype='int32'), 
                                              np.asarray(test_para_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                                              np.asarray(test_mask[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX),
                                              np.asarray(test_feature_matrixlist[test_para_id:test_para_id+batch_size], dtype=theano.config.floatX))
                    
#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+batch_size]
                    para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+batch_size]
                    paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    para_len=len(test_para_wordlist_list[0])
                    if para_len!=len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=batch_size
#                     print q_size
#                     print test_para_word_list
                    for q in range(batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], np.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q])
                        q_gold_ans_set=para_gold_ansset_list[q]
                         
                        F1=MacroF1(pred_ans, q_gold_ans_set)
                        exact_match+=F1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                exact_acc=exact_match/q_amount
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                print 'current average F1:', exact_acc, '\t\tmax F1:', max_exact_acc
                        



            if patience <= iter:
                done_looping = True
                break
        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=10, test_batch_size=200, emb_size=300, hidden_size=100,
                    L2_weight=0.0001, para_len_limit=300, q_len_limit=30, max_EM=40.0):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)


#     glove_vocab=set(word2vec.keys())
    train_para_list, train_Q_list, train_start_list,train_end_list,  train_para_mask, train_mask, word2id, train_feature_matrixlist=load_train_AI2(para_len_limit, q_len_limit)
    train_size=len(train_para_list)
    if train_size!=len(train_Q_list) or train_size!=len(train_start_list) or train_size!=len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_Q_list_word, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist, q_idlist= load_dev_or_test_AI2(word2id, para_len_limit, q_len_limit)
    test_size=len(test_para_list)
    if test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)





    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in overall_word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
#     labels = T.imatrix('labels')  #(batch, para_len)
    start_indices= T.ivector() #batch
    end_indices = T.ivector() #batch
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')
    extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)



    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size=paragraph.shape[0]

    norm_extraF=normalize_matrix(extraF)

    fwd_para=create_LSTM_para(rng, emb_size, hidden_size)  #create_LSTM_para(rng, word_dim, hidden_dim)
    bwd_para=create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para=fwd_para.values()+ bwd_para.values()

    fwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size)  #create_LSTM_para(rng, word_dim, hidden_dim)
    bwd_e1=create_LSTM_para(rng, 8*hidden_size, hidden_size)
    paragraph_para_e1=fwd_e1.values()+ bwd_e1.values()

    fwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size)  #create_LSTM_para(rng, word_dim, hidden_dim)
    bwd_e11=create_LSTM_para(rng, 2*hidden_size, hidden_size)
    paragraph_para_e11=fwd_e11.values()+ bwd_e11.values()

    fwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size)  #create_LSTM_para(rng, word_dim, hidden_dim)
    bwd_e2=create_LSTM_para(rng, 2*hidden_size, hidden_size)
    paragraph_para_e2=fwd_e2.values()+ bwd_e2.values()

#     U_e2, W_e2, b_e2=create_GRU_para(rng, hidden_size, hidden_size)
#     U_e2_b, W_e2_b, b_e2_b=create_GRU_para(rng, hidden_size, hidden_size)
#     paragraph_para_e2=[U_e2, W_e2, b_e2, U_e2_b, W_e2_b, b_e2_b]

#     fwd_Q=create_LSTM_para(rng, emb_size, hidden_size)  #create_LSTM_para(rng, word_dim, hidden_dim)
#     bwd_Q=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_Q.values()+ bwd_Q.values()

#     W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
#     W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a1 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features
    U_a2 = create_ensemble_para(rng, 1, 10*hidden_size) # 3 extra features
    U_a3 = create_ensemble_para(rng, 1, 6*hidden_size) # 3 extra features
#     LR_b = theano.shared(value=numpy.zeros((2,),
#                                                  dtype=theano.config.floatX),  # @UndefinedVariable
#                                name='LR_b', borrow=True)

    HL_paras=[U_a1, U_a2, U_a3]
    params = [embeddings]+paragraph_para+paragraph_para_e1+paragraph_para_e11+HL_paras+paragraph_para_e2

#     load_model_from_file(rootPath+'Best_Paras_AI2_31.210974456', params)

    paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)


    #self, X, Mask, hidden_dim, fwd_tparams, bwd_tparams
    paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_para, bwd_tparams= bwd_para)
    para_reps=paragraph_model.output_tensor #(batch, 2*hidden, para_len)


    Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)

    questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, fwd_tparams=fwd_para, bwd_tparams= bwd_para)
    questions_reps_tensor=questions_model.output_tensor #(batch, 2*hidden ,q_len)
#     questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden)
#     questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)  #(batch, para_len, hidden)

#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor





#     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
#     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
#     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
#     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
#     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
#     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
#     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)


    norm_U_a3=normalize_matrix(U_a3)
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (2*hidden, len)

        repeat_para_matrix_T=T.repeat(para_matrix.T, q_matrix.shape[1], axis=0) #(para_len*q_len, 2*hidden)
        repeat_q_matrix_3D = T.repeat(q_matrix.T.dimshuffle('x',0,1), para_matrix.shape[1], axis=0) #(para_len, q_len, 2*hidden)
        repeat_q_matrix_T= repeat_q_matrix_3D.reshape((repeat_q_matrix_3D.shape[0]*repeat_q_matrix_3D.shape[1], repeat_q_matrix_3D.shape[2])) #(para_len*q_len, 2*hidden)

        ele_mult =repeat_para_matrix_T*repeat_q_matrix_T #(#(para_len*q_len, 2*hidden))
        overall_concv = T.concatenate([repeat_para_matrix_T, repeat_q_matrix_T, ele_mult], axis=1) ##(para_len*q_len, 6*hidden)
        scores=T.dot(overall_concv, norm_U_a3)  #(para_len*q_len,1)
        interaction_matrix=scores.reshape((para_matrix.shape[1], q_matrix.shape[1]))  #(para_len, q_len)


#         transpose_para_matrix=para_matrix.T
#         interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
#         norm_interaction_matrix=T.maximum(0.0, interaction_matrix)
        q_by_para = T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(2*hidden, para_len)
        para_by_q = T.repeat(T.dot(para_matrix, T.nnet.softmax(T.max(interaction_matrix, axis=1).dimshuffle('x',0)).T), para_matrix.shape[1], axis=1)
        return (q_by_para, para_by_q)
    inter_return, updates = theano.scan(fn=example_in_batch,
                                   outputs_info=None,
                                   sequences=[para_reps, questions_reps_tensor])    #batch_q_reps (batch, hidden, para_len)

    batch_q_reps=inter_return[0] #(batch, 2*hidden, para_len)
    batch_para_reps=inter_return[1] #(batch, 2*hidden , para_len)

    #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len)
    ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps,para_reps*batch_q_reps, para_reps*batch_para_reps], axis=1) #(batch, 4*2*hidden, para_len) questions_reps.dimshuffle(0,2,1)
    para_ensemble_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e1, bwd_tparams= bwd_e1)
    para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, 2*hidden ,para_len)

    para_ensemble_model1=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e11, bwd_tparams= bwd_e11)
    para_reps_tensor4score1=para_ensemble_model1.output_tensor #(batch, 2*hidden ,para_len)


    Con_G_M=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score1], axis=1) #(batch, 10*hidden, para_len)

    #score for each para word
    norm_U_a=normalize_matrix(U_a1)
    start_scores=T.dot(Con_G_M.dimshuffle(0,2,1), norm_U_a)  #(batch, para_len, 1)
    start_scores=T.nnet.softmax(start_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len)

    # para_reps_tensor4score = T.concatenate([para_reps_tensor4score, start_scores.dimshuffle(0,'x',1)], axis=1)
    para_ensemble_model2=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(X=para_reps_tensor4score1, Mask=para_mask, hidden_dim=hidden_size,fwd_tparams=fwd_e2, bwd_tparams= bwd_e2)
    para_reps_tensor4score2=para_ensemble_model2.output_tensor #(batch, 2*hidden ,para_len)

    Con_G_M2=T.concatenate([ensemble_para_reps_tensor, para_reps_tensor4score2], axis=1) #(batch, 10*hidden, para_len)



    norm_U_a2=normalize_matrix(U_a2)
    end_scores=T.dot(Con_G_M2.dimshuffle(0,2,1), norm_U_a2)  #(batch, para_len, 1)
    end_scores=T.nnet.softmax(end_scores.reshape((true_batch_size, paragraph.shape[1]))) #(batch, para_len)


    #loss train

    loss=-T.mean(T.log(start_scores[T.arange(true_batch_size), start_indices])+T.log(end_scores[T.arange(true_batch_size), end_indices]))

    #test
    co_simi_batch_matrix=T.batched_dot((para_mask*start_scores).dimshuffle(0,1,'x'), (para_mask*end_scores).dimshuffle(0,'x',1)) #(batch, para_len, para_len)
    #reset lower dialgonal
    cols = numpy.concatenate([numpy.array(range(i), dtype=numpy.uint) for i in xrange(para_len_limit)])
    rows = numpy.concatenate([numpy.array([i]*i, dtype=numpy.uint) for i in xrange(para_len_limit)])
    c = T.set_subtensor(co_simi_batch_matrix[:,rows, cols], theano.shared(numpy.zeros(para_len_limit*(para_len_limit-1)/2)))
    #reset longer than 7 size
    cols2 = numpy.concatenate([numpy.array(range(i+7,para_len_limit), dtype=numpy.uint) for i in xrange(para_len_limit-7)])
    rows2 = numpy.concatenate([numpy.array([i]*(para_len_limit-7-i), dtype=numpy.uint) for i in xrange(para_len_limit-7)])
    c2 = T.set_subtensor(c[:,rows2, cols2], theano.shared(numpy.zeros((para_len_limit-7)*(para_len_limit-6)/2)))



    test_return=T.argmax(c2.reshape((true_batch_size, para_len_limit*para_len_limit)), axis=1) #batch


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,start_indices, end_indices,para_mask, q_mask, extraF], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask, extraF], test_return, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
#             haha=para_mask[para_id:para_id+batch_size]
#             print haha
#             for i in range(batch_size):
#                 print len(haha[i])
            cost_i+= train_model(
                                numpy.asarray([train_para_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_Q_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_start_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_end_list[id] for id in train_ids[para_id:para_id+batch_size]], dtype='int32'),
                                      numpy.asarray([train_para_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      numpy.asarray([train_mask[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX),
                                      numpy.asarray([train_feature_matrixlist[id] for id in train_ids[para_id:para_id+batch_size]], dtype=theano.config.floatX))

            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
#                 writefile=codecs.open(rootPath+'predictions.txt', 'w', 'utf-8')
#                 writefile.write('{')
                pred_dict={}
#                 exact_match=0.0
#                 F1_match=0.0
                q_amount=0
                for test_para_id in test_batch_start:
                    batch_predict_ids=test_model(
                                        numpy.asarray(test_para_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                              numpy.asarray(test_Q_list[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                              numpy.asarray(test_para_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                              numpy.asarray(test_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                              numpy.asarray(test_feature_matrixlist[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX))

#                     print distribution_matrix
                    test_para_wordlist_list=test_text_list[test_para_id:test_para_id+test_batch_size]
#                     para_gold_ansset_list=q_ansSet_list[test_para_id:test_para_id+test_batch_size]
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
#                     print 'q_ids_batch:', q_ids_batch
                    # paralist_extra_features=test_feature_matrixlist[test_para_id:test_para_id+batch_size]
                    # sub_para_mask=test_para_mask[test_para_id:test_para_id+batch_size]
                    # para_len=len(test_para_wordlist_list[0])
                    # if para_len!=len(distribution_matrix[0]):
                    #     print 'para_len!=len(distribution_matrix[0]):', para_len, len(distribution_matrix[0])
                    #     exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount+=test_batch_size
#                     print q_size
#                     print test_para_word_list

#                     Q_list_inword=test_Q_list_word[test_para_id:test_para_id+test_batch_size]
                    for q in range(test_batch_size): #for each question
#                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
#                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
#                         else:
#                             ss=len(distribution_matrix[q])
#                             combine_list=[]
#                             for ii in range(ss):
#                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
#                             print combine_list
#                         exit(0)
#                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans=decode_predict_id_AI2(batch_predict_ids[q], para_len_limit, test_para_wordlist_list[q])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
#                         writefile.write('"'+str(q_id)+'": "'+pred_ans+'", ')
                        # pred_ans=extract_ansList_attentionList(test_para_wordlist_list[q], distribution_matrix[q], numpy.asarray(paralist_extra_features[q], dtype=theano.config.floatX), sub_para_mask[q], Q_list_inword[q])
#                         q_gold_ans_set=para_gold_ansset_list[q]
# #                         print test_para_wordlist_list[q]
# #                         print Q_list_inword[q]
# #                         print pred_ans.encode('utf8'), q_gold_ans_set
#                         if pred_ans in q_gold_ans_set:
#                             exact_match+=1
#                         F1=MacroF1(pred_ans, q_gold_ans_set)
#                         F1_match+=F1
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
#                 F1_acc=F1_match/q_amount
#                 exact_acc=exact_match/q_amount
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Paras_AI2_'+str(max_exact_acc), params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc


#                 os.system('python evaluate-v1.1.py '+rootPath+'dev-v1.1.json '+rootPath+'predictions.txt')




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=100, emb_size=300, char_emb_size=20, hidden_size=300,
                    L2_weight=0.0001, p_len_limit=400, test_p_len_limit=100, q_len_limit=20, char_len=15, filter_size = [5,5],
                    char_filter_size=3, margin=2.0, max_EM=50.302743615):
    test_batch_size=batch_size*10
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)


    word2id={}
    char2id={}
    #questions,paragraphs,q_masks,p_masks,labels, word2id
    train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id=load_squad_cnn_rank_word_train(word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size=len(train_para_list)

    test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list= load_squad_cnn_rank_word_dev(word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size=len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask, dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask, dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask, dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask, dtype=theano.config.floatX)



    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, rng)
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, rng)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices= T.imatrix() #batch, (start, end) for each sample
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')

    char_paragraph = T.imatrix() #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask=T.fmatrix()
    char_q_mask=T.fmatrix()

    true_p_len = T.iscalar()



    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]

    common_input_p=embeddings[paragraph.flatten()].reshape((true_batch_size,true_p_len, emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    common_input_q=embeddings[questions.flatten()].reshape((true_batch_size,q_len_limit, emb_size))


    char_common_input_p=char_embeddings[char_paragraph.flatten()].reshape((true_batch_size*true_p_len, char_len, char_emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q=char_embeddings[char_questions.flatten()].reshape((true_batch_size*q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape((true_batch_size*true_p_len, char_len))
    char_q_masks = char_q_mask.reshape((true_batch_size*q_len_limit, char_len))

    conv_W_char, conv_b_char=create_conv_para(rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0]))
    conv_W_2, conv_b_2=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))

    conv_W_1_q, conv_b_1_q=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size+char_emb_size, filter_size[0]))
    conv_W_2_q, conv_b_2_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[1]))
    NN_para=[conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char]

    input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,batch_size, p_len_limit,q_len_limit,
                         emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size,
                         conv_W_1, conv_b_1,conv_W_2, conv_b_2,conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char,
                         para_mask, q_mask, char_p_masks,char_q_masks)

    test_input4score = squad_cnn_rank_word(rng, common_input_p, common_input_q, char_common_input_p, char_common_input_q,test_batch_size, test_p_len_limit,q_len_limit,
                         emb_size, char_emb_size,char_len,filter_size,char_filter_size,hidden_size,
                         conv_W_1, conv_b_1,conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q, conv_W_2_q, conv_b_2_q,conv_W_char,conv_b_char,
                         para_mask, q_mask, char_p_masks,char_q_masks)  #(batch, hidden, #(batch, 2*hidden, p_len_limit))

    # gram_size = 5*true_p_len-(0+1+2+3+4)


    HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a=normalize_matrix(U_a)
    norm_HL_1_para=normalize_matrix(HL_1_para)
    norm_HL_2_para=normalize_matrix(HL_2_para)
    norm_HL_3_para=normalize_matrix(HL_3_para)
    norm_HL_4_para=normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size, 2*hidden_size)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size)
    end_norm_U_a=normalize_matrix(end_U_a)
    end_norm_HL_1_para=normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para=normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para=normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para=normalize_matrix(end_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para, norm_U_a, batch_size,true_p_len)
    span_scores=T.nnet.softmax(span_scores_matrix) #(batch, para_len)
    end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para, end_norm_U_a, batch_size,true_p_len)
    end_span_scores=T.nnet.softmax(end_span_scores_matrix) #(batch, para_len)
    loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,0]]))
    end_loss_neg_likelihood=-T.mean(T.log(span_scores[T.arange(batch_size), gold_indices[:,1]]))

    #ranking loss start
    tanh_span_scores_matrix = span_scores#T.tanh(span_scores_matrix) #(batch, gram_size)
    index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(index_matrix[T.arange(batch_size), gold_indices[:,0]], 1.0)
    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0-new_index_matrix).nonzero()]
    repeat_posi = T.extra_ops.repeat(prob_batch_posi, prob_batch_nega.shape[0], axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x',0), prob_batch_posi.shape[0], axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin-repeat_posi+repeat_nega))

    #ranking loss END
    end_tanh_span_scores_matrix = end_span_scores#T.tanh(span_scores_matrix) #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit), dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(end_index_matrix[T.arange(batch_size), gold_indices[:,1]], 1.0)
    end_prob_batch_posi = end_tanh_span_scores_matrix[end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_span_scores_matrix[(1.0-end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi, end_prob_batch_nega.shape[0], axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle('x',0), end_prob_batch_posi.shape[0], axis=0).flatten()
    end_loss_rank = T.mean(T.maximum(0.0, margin-end_repeat_posi+end_repeat_nega))






    loss = loss_neg_likelihood +end_loss_neg_likelihood+loss_rank+end_loss_rank

    #test
    test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, norm_HL_1_para,norm_HL_2_para,norm_HL_3_para,norm_HL_4_para,norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len)
    mask_test_return=T.argmax(test_span_scores_matrix*para_mask, axis=1) #batch

    end_test_span_scores_matrix = add_HLs_2_tensor3(test_input4score, end_norm_HL_1_para,end_norm_HL_2_para,end_norm_HL_3_para,end_norm_HL_4_para,end_norm_U_a, true_batch_size,true_p_len) #(batch, test_p_len)
    end_mask_test_return=T.argmax(end_test_span_scores_matrix*para_mask, axis=1) #batch



    params = [embeddings,char_embeddings]+NN_para+[U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para]+[end_U_a,end_HL_1_para,end_HL_2_para,end_HL_3_para,end_HL_4_para]

    L2_reg =L2norm_paraList([embeddings,char_embeddings,conv_W_1,conv_W_2,conv_W_1_q, conv_W_2_q, conv_W_char,U_a,HL_1_para,HL_2_para,HL_3_para,HL_4_para])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+L2_weight*L2_reg


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask,    char_paragraph, #(batch, char_len*p_len)
        char_questions, char_para_mask, char_q_mask, true_p_len], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask,
        char_paragraph,
        char_questions,
        char_para_mask,
        char_q_mask,
                true_p_len], [mask_test_return,end_mask_test_return], on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_ids[para_id:para_id+batch_size]
            cost_i+= train_model(
                                 train_para_list[train_id_batch],
                                 train_Q_list[train_id_batch],
                                 train_label_list[train_id_batch],
                                 train_para_mask[train_id_batch],
                                 train_Q_mask[train_id_batch],
                                 train_para_char_list[train_id_batch],
                                 train_Q_char_list[train_id_batch],
                                 train_para_char_mask[train_id_batch],
                                 train_Q_char_mask[train_id_batch],
                                 p_len_limit)


            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict={}
                q_amount=0
                p1=0
                for test_para_id in test_batch_start:
                    batch_predict_ids, batch_predict_end_ids=test_model(
                                                 test_para_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_para_char_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_char_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_char_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_char_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_p_len_limit)
                    test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size]
#                     test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
#                     q_amount+=test_batch_size
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
                    q_amount+=test_batch_size

                    for q in range(test_batch_size): #for each question
#                         pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])

                        start = batch_predict_ids[q]
                        end = batch_predict_end_ids[q]
                        if end < start:
                            start, end = end, start
                        pred_ans = ' '.join(test_para_wordlist_batch[q][start:end+1])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc






            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=500, test_batch_size=1000, emb_size=300, hidden_size=300, HL_hidden_size=200,
                    L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=1000,
                    para_len=60, question_len=20, c_len=7, e_len=2):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/work/hs/yin/20161219/';
    storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)
    
    word2id={}
    word2id['UNK']=0 # use it to pad 
    word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks, train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len, e_len, c_len, word2id, rootPath+'squadnewtrn.txt')
    word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len,word2id, rootPath+'squadnewdev.txt')

    print 'word2id size for bigger dataset:', len(word2id)
    word2id, train_questions,train_questions_mask,train_paras,train_paras_mask,train_e_ids,train_e_masks,train_c_ids,train_c_masks, train_c_heads,train_c_tails,train_l_heads,train_l_tails,train_e_heads,train_e_tails,train_labels, train_labels_3c=load_SQUAD_hinrich_v2(train_size, para_len, question_len,e_len, c_len, word2id, rootPath+'squadnewtrn,subset.000.txt')
    word2id, test_questions,test_questions_mask,test_paras,test_paras_mask,test_e_ids,test_e_masks,test_c_ids,test_c_masks, test_c_heads,test_c_tails,test_l_heads,test_l_tails,test_e_heads,test_e_tails,test_labels, test_labels_3c=load_SQUAD_hinrich_v2(test_size, para_len, question_len, e_len, c_len,word2id, rootPath+'squadnewdev,subset.000.txt')
    
    print 'word2id size for smaller dataset:', len(word2id)
#     if len(train_questions)!=train_size or len(test_questions)!=test_size:
#         print 'len(questions)!=train_size or len(test_questions)!=test_size:', len(train_questions),train_size,len(test_questions),test_size
#         exit(0)
    train_size=len(train_questions)
    test_size = len(test_questions)
    
    train_questions = np.asarray(train_questions, dtype='int32')
    
#     print train_questions[:10,:]
#     exit(0)
    train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX)
    train_paras = np.asarray(train_paras, dtype='int32')
    train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX)

    train_e_ids = np.asarray(train_e_ids, dtype='int32')
    train_e_masks = np.asarray(train_e_masks, dtype=theano.config.floatX)
    train_c_ids = np.asarray(train_c_ids, dtype='int32')
    train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX)

    train_c_heads = np.asarray(train_c_heads, dtype='int32')
    train_c_tails = np.asarray(train_c_tails, dtype='int32')
    train_l_heads = np.asarray(train_l_heads, dtype='int32')
    train_l_tails = np.asarray(train_l_tails, dtype='int32')
    train_e_heads = np.asarray(train_e_heads, dtype='int32')
    train_e_tails = np.asarray(train_e_tails, dtype='int32')
    train_labels = np.asarray(train_labels, dtype='int32')
    train_labels_3c = np.asarray(train_labels_3c, dtype='int32')

    test_questions = np.asarray(test_questions, dtype='int32')
    test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX)
    test_paras = np.asarray(test_paras, dtype='int32')
    test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX)

    test_e_ids = np.asarray(test_e_ids, dtype='int32')
    test_e_masks = np.asarray(test_e_masks, dtype=theano.config.floatX)
    test_c_ids = np.asarray(test_c_ids, dtype='int32')
    test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX)

    test_c_heads = np.asarray(test_c_heads, dtype='int32')
    test_c_tails = np.asarray(test_c_tails, dtype='int32')
    test_l_heads = np.asarray(test_l_heads, dtype='int32')
    test_l_tails = np.asarray(test_l_tails, dtype='int32')
    test_e_heads = np.asarray(test_e_heads, dtype='int32')
    test_e_tails = np.asarray(test_e_tails, dtype='int32')
    test_labels = np.asarray(test_labels, dtype='int32')

    overall_vocab_size=len(word2id)
    print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, rng)
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()

    para=T.imatrix()  #(2*batch, len)
    para_mask=T.fmatrix() #(2*batch, len)

    c_ids=T.imatrix()  #(2*batch, len)
    c_mask=T.fmatrix() #(2*batch, len)
    e_ids=T.imatrix()  #(2*batch, len)
    e_mask=T.fmatrix() #(2*batch, len)

    c_heads=T.ivector() #batch
    c_tails=T.ivector() #batch
    l_heads=T.ivector() #batch
    l_tails=T.ivector() #batch
    e_heads=T.ivector() #batch
    e_tails=T.ivector() #batch
    q=T.imatrix()  #(2*batch, len_q)
    q_mask=T.fmatrix() #(2*batch, len_q)
    labels=T.ivector() #batch





    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size = para.shape[0]

#     U_p, W_p, b_p=create_GRU_para(rng, emb_size, hidden_size)
#     U_p_b, W_p_b, b_p_b=create_GRU_para(rng, emb_size, hidden_size)
#     GRU_p_para=[U_p, W_p, b_p, U_p_b, W_p_b, b_p_b]
#     
#     U_q, W_q, b_q=create_GRU_para(rng, emb_size, hidden_size)
#     U_q_b, W_q_b, b_q_b=create_GRU_para(rng, emb_size, hidden_size)
#     GRU_q_para=[U_q, W_q, b_q, U_q_b, W_q_b, b_q_b]
    
    paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len)
    q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len)


    fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen)

#     paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U_p,W=W_p,b=b_p,Ub=U_p_b,Wb=W_p_b,bb=b_p_b)
#     paragraph_reps_tensor3=paragraph_model.output_tensor_conc #(batch, 2*hidden, para_len)


    fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
    question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden)

#     q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U_q,W=W_q,b=b_q,Ub=U_q_b,Wb=W_q_b,bb=b_q_b)
#     q_reps=q_model.output_sent_rep_conc #(batch, 2*hidden)

    #interaction
    batch_ids=T.arange(true_batch_size)
    c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden)
    c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden)
    candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden)

    l_heads_reps=paragraph_reps_tensor3[batch_ids,:,l_heads] #(batch, 2*hidden)
    l_tails_reps=paragraph_reps_tensor3[batch_ids,:,l_tails] #(batch, 2*hidden)
    longs_reps=T.concatenate([l_heads_reps, l_tails_reps], axis=1) #(batch, 4*hidden)

    e_heads_reps=paragraph_reps_tensor3[batch_ids,:,e_heads] #(batch, 2*hidden)
    e_tails_reps=paragraph_reps_tensor3[batch_ids,:,e_tails] #(batch, 2*hidden)
    extensions_reps=T.concatenate([e_heads_reps, e_tails_reps], axis=1) #(batch, 4*hidden)
    
    
    #glove level average
    c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    c_sum = T.sum(c_input*c_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size)
    average_C_batch = c_sum/T.sqrt(T.sum(c_sum**2, axis=1)+1e-20).dimshuffle(0,'x')

    e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    e_sum = T.sum(e_input*e_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size)
    average_E_batch = e_sum/T.sqrt(T.sum(e_sum**2, axis=1)+1e-20).dimshuffle(0,'x')    

#     e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, emb_size)
    average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x')      
#     def submatrix_average(matrix, head, tail):
#         return T.mean(matrix[:, head:tail+1], axis=1) #emb_size
#     def submatrix_average_q(matrix, head):
#         return T.mean(matrix[:, head:], axis=1) #emb_size
#     
#     average_E_batch, _ = theano.scan(fn=submatrix_average,
#                                    sequences=[paragraph_input,e_heads, e_tails])    #(batch, emb_size)
#     average_C_batch, _ = theano.scan(fn=submatrix_average,
#                                    sequences=[paragraph_input,c_heads, c_tails])  #(batch, emb_size)
#     
#     Q_valid_len=T.cast(T.sum(q_mask, axis=1), 'int32')
#     
#     average_Q_batch, _ = theano.scan(fn=submatrix_average_q,
#                                    sequences=[q_input,-Q_valid_len])     #(batch, emb_size)
    #classify




    HL_layer_subtask_input=T.concatenate([q_reps, extensions_reps, average_E_batch, average_Q_batch], axis=1) #(batch, 6*hidden+2*emb)
    HL_layer_subtask_size= 6*hidden_size+2*emb_size#HL_layer_1_input_size+2*HL_hidden_size

    HL_layer_subtask_1=HiddenLayer(rng, input=HL_layer_subtask_input, n_in=HL_layer_subtask_size, n_out=HL_hidden_size, activation=T.tanh)
    HL_layer_subtask_2=HiddenLayer(rng, input=HL_layer_subtask_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh)        
    U_subtask_a = create_ensemble_para(rng, 2, HL_hidden_size) # the weight matrix hidden_size*2
    norm_U_subtask_a=normalize_matrix(U_subtask_a)
    LR_subtask_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class  
    LR_subtask_para=[U_subtask_a, LR_subtask_b]
    layer_LR_subtask=LogisticRegression(rng, input=HL_layer_subtask_2.output, n_in=HL_hidden_size, n_out=2, W=norm_U_subtask_a, b=LR_subtask_b) #basically it is a multiplication between weight matrix and input feature vector
    

    HL_layer_1_input_size=14*hidden_size+3*emb_size+1
    #, average_E_batch, average_C_batch, average_Q_batch
    HL_layer_1_input = T.concatenate([q_reps, longs_reps, extensions_reps, candididates_reps, average_E_batch, average_C_batch, average_Q_batch, layer_LR_subtask.prop_for_posi.reshape((true_batch_size,1))], axis=1) #(batch, 14*hidden_size+3*emb_size+1)
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh)
        
    LR_input=HL_layer_2.output #T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1) #(batch, 10*hidden)
    LR_input_size= HL_hidden_size#HL_layer_1_input_size+2*HL_hidden_size
    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    norm_U_a=normalize_matrix(U_a)
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class  
    LR_para=[U_a, LR_b]
    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)+layer_LR_subtask.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    






    params = LR_para+[embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params+LR_subtask_para+HL_layer_subtask_1.params+HL_layer_subtask_2.params
    
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+0.0005*T.mean(U_a**2)


    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20)))   #AdaGrad
        updates.append((acc_i, acc))


    train_model = theano.function([para, para_mask,c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], cost, updates=updates,on_unused_input='ignore')

    train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore')


    test_model = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], [layer_LR.errors(labels),layer_LR.y_pred], on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size    #batch_size means how many pairs
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 

    n_train_batches_pred=train_size/batch_size_pred    #batch_size means how many pairs
    train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] 

    n_test_batches=test_size/test_batch_size    #batch_size means how many pairs
    test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]




    max_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    train_ids_pred = range(train_size)
    best_test_statistic=defaultdict(int)
#     best_train_statistic=defaultdict(int)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
#         print train_ids[:100]
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1

            iter_accu+=1
            train_id_list = train_ids[para_id:para_id+batch_size]
#             print 'train_labels[train_id_list]:', train_labels[train_id_list]
            cost_i+= train_model(
                                train_paras[train_id_list],
                                train_paras_mask[train_id_list],
                                
                                train_c_ids[train_id_list],
                                train_c_masks[train_id_list],
                                train_e_ids[train_id_list],
                                train_e_masks[train_id_list],
                                
                                train_c_heads[train_id_list],
                                train_c_tails[train_id_list],
                                train_l_heads[train_id_list],
                                train_l_tails[train_id_list],
                                train_e_heads[train_id_list],
                                train_e_tails[train_id_list],
                                train_questions[train_id_list],
                                train_questions_mask[train_id_list],
                                train_labels[train_id_list])

            #print iter
            if  iter%10==0: #iter>=200 and
                print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'

                past_time = time.time()
#                 print 'Training Pred...'
#                 train_statistic=defaultdict(int)
#                 for para_id in train_batch_start_pred:
#                     train_id_list = train_ids_pred[para_id:para_id+batch_size_pred]
#                     gold_train_labels_list = train_labels_3c[train_id_list]
# #                     print 'train_id_list:', train_id_list
# #                     print 'train_c_heads[train_id_list]:', train_c_heads[train_id_list]
#                     train_preds_i= train_model_pred(
#                                         train_paras[train_id_list],
#                                         train_paras_mask[train_id_list],
#                                         train_c_ids[train_id_list],
#                                         train_c_masks[train_id_list],
#                                         train_e_ids[train_id_list],
#                                         train_e_masks[train_id_list],
#                                         train_c_heads[train_id_list],
#                                         train_c_tails[train_id_list],
#                                         train_l_heads[train_id_list],
#                                         train_l_tails[train_id_list],
#                                         train_e_heads[train_id_list],
#                                         train_e_tails[train_id_list],
#                                         train_questions[train_id_list],
#                                         train_questions_mask[train_id_list],
#                                         train_labels[train_id_list])  
# 
#                     for ind, gold_label in enumerate(gold_train_labels_list):
#                         train_statistic[(gold_label, train_preds_i[ind])]+=1   
#                     train_acc= (train_statistic.get((1,1),0)+train_statistic.get((0,0),0))*1.0/(train_statistic.get((1,1),0)+train_statistic.get((0,0),0)+train_statistic.get((1,0),0)+train_statistic.get((0,1),0))
#                             
#                 print '\t\tcurrnt train acc:', train_acc, ' train_statistic:', train_statistic
                print 'Testing...'
                error=0
                test_statistic=defaultdict(int)
                for test_para_id in test_batch_start:
                    test_id_list = range(test_para_id, test_para_id+test_batch_size)   
#                     print 'test_id_list:',test_id_list    
#                     print 'test_c_heads[test_id_list]', test_c_heads[test_id_list]
                    gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size]
                    error_i, preds_i= test_model(
                                        test_paras[test_id_list],
                                        test_paras_mask[test_id_list],
                                        test_c_ids[test_id_list],
                                        test_c_masks[test_id_list],
                                        test_e_ids[test_id_list],
                                        test_e_masks[test_id_list],
                                        test_c_heads[test_id_list],
                                        test_c_tails[test_id_list],
                                        test_l_heads[test_id_list],
                                        test_l_tails[test_id_list],
                                        test_e_heads[test_id_list],
                                        test_e_tails[test_id_list],
                                        test_questions[test_id_list],
                                        test_questions_mask[test_id_list],
                                        test_labels[test_id_list])

                    error+=error_i
                    for ind, gold_label in enumerate(gold_labels_list):
                        test_statistic[(gold_label, preds_i[ind])]+=1
#                 acc=1.0-error*1.0/len(test_batch_start)
                acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0))
                
                if acc> max_acc:
                    max_acc=acc
                    best_test_statistic=test_statistic
                    store_model_to_file(storePath+'Best_Paras_HS_v2_000_subtask_'+str(max_acc), params)
                    print 'Finished storing best  params at:', max_acc
                print 'current average acc:', acc, '\t\tmax acc:', max_acc, '\ttest_statistic:', test_statistic
                print '\t\t\t\tbest statistic:', best_test_statistic




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #6
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=100,
                    emb_size=300,
                    char_emb_size=20,
                    hidden_size=300,
                    L2_weight=0.0001,
                    p_len_limit=400,
                    test_p_len_limit=100,
                    q_len_limit=20,
                    char_len=15,
                    filter_size=[5, 5],
                    char_filter_size=3,
                    margin=2.0,
                    max_EM=50.302743615):
    test_batch_size = batch_size * 10
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    word2id = {}
    char2id = {}
    #questions,paragraphs,q_masks,p_masks,labels, word2id
    train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_label_list, word2id, char2id = load_squad_cnn_rank_word_train(
        word2id, char2id, p_len_limit, q_len_limit, char_len)
    train_size = len(train_para_list)

    test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, test_label_list, q_idlist, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_word_dev(
        word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    test_size = len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask,
                                    dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask,
                                      dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask,
                                         dtype=theano.config.floatX)

    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask,
                                     dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask,
                                        dtype=theano.config.floatX)

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX, rng)
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX, rng)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices = T.imatrix()  #batch, (start, end) for each sample
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    char_paragraph = T.imatrix()  #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask = T.fmatrix()
    char_q_mask = T.fmatrix()

    true_p_len = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]

    common_input_p = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, true_p_len,
         emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    common_input_q = embeddings[questions.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size))

    char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape(
        (true_batch_size * true_p_len, char_len, char_emb_size
         ))  #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q = char_embeddings[char_questions.flatten()].reshape(
        (true_batch_size * q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape(
        (true_batch_size * true_p_len, char_len))
    char_q_masks = char_q_mask.reshape(
        (true_batch_size * q_len_limit, char_len))

    conv_W_char, conv_b_char = create_conv_para(
        rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1 = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + char_emb_size,
                      filter_size[0]))
    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size, 1,
                                                        hidden_size,
                                                        filter_size[1]))

    conv_W_1_q, conv_b_1_q = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + char_emb_size,
                      filter_size[0]))
    conv_W_2_q, conv_b_2_q = create_conv_para(rng,
                                              filter_shape=(hidden_size, 1,
                                                            hidden_size,
                                                            filter_size[1]))
    NN_para = [
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q,
        conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char
    ]

    input4score = squad_cnn_rank_word(
        rng, common_input_p, common_input_q, char_common_input_p,
        char_common_input_q, batch_size, p_len_limit, q_len_limit, emb_size,
        char_emb_size, char_len, filter_size, char_filter_size, hidden_size,
        conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q, conv_b_1_q,
        conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char, para_mask, q_mask,
        char_p_masks, char_q_masks)  #(batch, 4*hidden, p_len_limit)

    test_input4score = squad_cnn_rank_word(
        rng, common_input_p, common_input_q, char_common_input_p,
        char_common_input_q, test_batch_size, test_p_len_limit, q_len_limit,
        emb_size, char_emb_size, char_len, filter_size, char_filter_size,
        hidden_size, conv_W_1, conv_b_1, conv_W_2, conv_b_2, conv_W_1_q,
        conv_b_1_q, conv_W_2_q, conv_b_2_q, conv_W_char, conv_b_char,
        para_mask, q_mask, char_p_masks,
        char_q_masks)  #(batch, 4*hidden, p_len_limit)

    # gram_size = 5*true_p_len-(0+1+2+3+4)

    HL_1_para = create_ensemble_para(rng, hidden_size,
                                     6 * hidden_size + char_emb_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, hidden_size)
    norm_U_a = normalize_matrix(U_a)
    norm_HL_1_para = normalize_matrix(HL_1_para)
    norm_HL_2_para = normalize_matrix(HL_2_para)
    norm_HL_3_para = normalize_matrix(HL_3_para)
    norm_HL_4_para = normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(rng, hidden_size,
                                         6 * hidden_size + char_emb_size)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(rng, 1, hidden_size)
    end_norm_U_a = normalize_matrix(end_U_a)
    end_norm_HL_1_para = normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para = normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para = normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para = normalize_matrix(end_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(input4score, norm_HL_1_para,
                                           norm_HL_2_para, norm_HL_3_para,
                                           norm_HL_4_para, norm_U_a,
                                           batch_size, true_p_len)
    span_scores = T.nnet.softmax(span_scores_matrix)  #(batch, para_len)
    end_span_scores_matrix = add_HLs_2_tensor3(input4score, end_norm_HL_1_para,
                                               end_norm_HL_2_para,
                                               end_norm_HL_3_para,
                                               end_norm_HL_4_para,
                                               end_norm_U_a, batch_size,
                                               true_p_len)
    end_span_scores = T.nnet.softmax(
        end_span_scores_matrix)  #(batch, para_len)
    loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), gold_indices[:, 0]]))
    end_loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), gold_indices[:, 1]]))

    #ranking loss start
    tanh_span_scores_matrix = span_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    index_matrix = T.zeros((batch_size, p_len_limit),
                           dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(
        index_matrix[T.arange(batch_size), gold_indices[:, 0]], 1.0)
    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0 -
                                               new_index_matrix).nonzero()]
    repeat_posi = T.extra_ops.repeat(prob_batch_posi,
                                     prob_batch_nega.shape[0],
                                     axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0),
                                     prob_batch_posi.shape[0],
                                     axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega))

    #ranking loss END
    end_tanh_span_scores_matrix = end_span_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit),
                               dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(
        end_index_matrix[T.arange(batch_size), gold_indices[:, 1]], 1.0)
    end_prob_batch_posi = end_tanh_span_scores_matrix[
        end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_span_scores_matrix[(
        1.0 - end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi,
                                         end_prob_batch_nega.shape[0],
                                         axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle(
        'x', 0),
                                         end_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    end_loss_rank = T.mean(
        T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega))

    loss = loss_neg_likelihood + end_loss_neg_likelihood + loss_rank + end_loss_rank

    #test
    test_span_scores_matrix = add_HLs_2_tensor3(
        test_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para,
        norm_HL_4_para, norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    mask_test_return = T.argmax(test_span_scores_matrix * para_mask,
                                axis=1)  #batch

    end_test_span_scores_matrix = add_HLs_2_tensor3(
        test_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
        end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    end_mask_test_return = T.argmax(end_test_span_scores_matrix * para_mask,
                                    axis=1)  #batch

    params = (
        [embeddings, char_embeddings] + NN_para +
        [U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] +
        [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para])

    L2_reg = L2norm_paraList([
        embeddings, char_embeddings, conv_W_1, conv_W_2, conv_W_1_q,
        conv_W_2_q, conv_W_char, U_a, HL_1_para, HL_2_para, HL_3_para,
        HL_4_para
    ])
    #L2_reg = L2norm_paraList(params)
    cost = loss + L2_weight * L2_reg

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function(
        [
            paragraph,
            questions,
            gold_indices,
            para_mask,
            q_mask,
            char_paragraph,  #(batch, char_len*p_len)
            char_questions,
            char_para_mask,
            char_q_mask,
            true_p_len
        ],
        cost,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function([
        paragraph, questions, para_mask, q_mask, char_paragraph,
        char_questions, char_para_mask, char_q_mask, true_p_len
    ], [mask_test_return, end_mask_test_return],
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / test_batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(numpy.arange(n_test_batches) *
                            test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.Random(4).shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_ids[para_id:para_id + batch_size]
            cost_i += train_model(
                train_para_list[train_id_batch], train_Q_list[train_id_batch],
                train_label_list[train_id_batch],
                train_para_mask[train_id_batch], train_Q_mask[train_id_batch],
                train_para_char_list[train_id_batch],
                train_Q_char_list[train_id_batch],
                train_para_char_mask[train_id_batch],
                train_Q_char_mask[train_id_batch], p_len_limit)

            #print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict = {}
                q_amount = 0
                p1 = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids, batch_predict_end_ids = test_model(
                        test_para_list[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_list[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_mask[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_mask[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_char_list[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id +
                                         test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id +
                                         test_batch_size], test_p_len_limit)
                    test_para_wordlist_batch = test_para_wordlist_list[
                        test_para_id:test_para_id + test_batch_size]
                    #                     test_label_batch=test_label_list[test_para_id:test_para_id+test_batch_size]
                    #                     q_amount+=test_batch_size
                    q_ids_batch = q_idlist[test_para_id:test_para_id +
                                           test_batch_size]
                    q_amount += test_batch_size

                    for q in range(test_batch_size):  #for each question
                        #                         pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])

                        start = batch_predict_ids[q]
                        end = batch_predict_end_ids[q]
                        if end < start:
                            start, end = end, start
                        pred_ans = ' '.join(
                            test_para_wordlist_batch[q][start:end + 1])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
                with codecs.open(rootPath + 'predictions.txt', 'w',
                                 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json',
                                                  rootPath + 'predictions.txt')
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc


#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=100,
                    batch_size=100,
                    emb_size=300,
                    char_emb_size=20,
                    hidden_size=10,
                    L2_weight=0.0001,
                    p_len_limit=400,
                    test_p_len_limit=100,
                    q_len_limit=20,
                    char_len=15,
                    filter_size=[5, 5, 5, 5, 5],
                    char_filter_size=5,
                    margin=0.85,
                    extra_size=5 + 11,
                    extra_emb=10,
                    distance=10,
                    distance_emb=10,
                    comment='add distance embs'):  #extra_size=3+46+7
    test_batch_size = batch_size
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)

    word2id = {}
    char2id = {}

    train_Q_list, train_para_list, train_Q_mask, train_para_mask, train_Q_char_list, train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras, word2id, char2id = load_squad_cnn_rank_span_word_train(
        word2id, char2id, p_len_limit, q_len_limit, char_len)
    test_Q_list, test_para_list, test_Q_mask, test_para_mask, test_Q_char_list, test_para_char_list, test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = load_squad_cnn_rank_span_word_dev(
        word2id, char2id, test_p_len_limit, q_len_limit, char_len)
    '''
    #store variables into file
    '''
    #     train_variables = [train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras]
    #     test_variables =[test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list]
    #     with open(rootPath+'extra.3.pickle', 'wb') as f:  # Python 3: open(..., 'wb')
    #         cPickle.dump(train_variables+test_variables, f, protocol=cPickle.HIGHEST_PROTOCOL)
    #     f.close()
    #     print 'variable stored successfully'
    #     exit(0)
    '''
    load variables from file
    '''
    #     before_load_time = time.time()
    #     with open(rootPath+'extra.3.pickle', 'rb') as f:  # Python 3: open(..., 'rb')
    #         train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_Q_char_list,train_para_char_list, train_Q_char_mask, train_para_char_mask, train_span_label_list, train_word_label_list, train_para_extras,test_Q_list, test_para_list,  test_Q_mask, test_para_mask,test_Q_char_list, test_para_char_list,  test_Q_char_mask, test_para_char_mask, q_idlist, test_para_extras, word2id, char2id, test_para_wordlist_list = cPickle.load(f)
    #     f.close()
    #     print 'load data variables successfully, spend: ',     (time.time()-before_load_time)/60.0, ' mins'
    train_size = len(train_para_list)
    test_size = len(test_para_list)

    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask,
                                    dtype=theano.config.floatX)

    train_Q_char_list = numpy.asarray(train_Q_char_list, dtype='int32')
    train_para_char_list = numpy.asarray(train_para_char_list, dtype='int32')
    train_Q_char_mask = numpy.asarray(train_Q_char_mask,
                                      dtype=theano.config.floatX)
    train_para_char_mask = numpy.asarray(train_para_char_mask,
                                         dtype=theano.config.floatX)

    train_para_extras = numpy.asarray(train_para_extras,
                                      dtype=theano.config.floatX)

    train_span_label_list = numpy.asarray(train_span_label_list, dtype='int32')
    train_word_label_list = numpy.asarray(train_word_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_char_list = numpy.asarray(test_Q_char_list, dtype='int32')
    test_para_char_list = numpy.asarray(test_para_char_list, dtype='int32')
    test_Q_char_mask = numpy.asarray(test_Q_char_mask,
                                     dtype=theano.config.floatX)
    test_para_char_mask = numpy.asarray(test_para_char_mask,
                                        dtype=theano.config.floatX)

    test_para_extras = numpy.asarray(test_para_extras,
                                     dtype=theano.config.floatX)

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX, rng)
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_size = len(char2id)
    print 'char size: ', char_size
    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX, rng)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    extra_rand_values = random_value_normal((extra_size, extra_emb),
                                            theano.config.floatX, rng)
    extra_embeddings = theano.shared(value=extra_rand_values, borrow=True)

    distance_rand_values = random_value_normal(
        (2 * distance + 1, distance_emb), theano.config.floatX, rng)
    distance_embeddings = theano.shared(value=distance_rand_values,
                                        borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    span_indices = T.ivector()  #batch
    word_indices = T.imatrix()  #(batch, 2)
    ans_indices = T.ivector()  # for one batch, the length is dynamic
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    extra = T.ftensor3()  #(batch, p_len, 3)

    char_paragraph = T.imatrix()  #(batch, char_len*p_len)
    char_questions = T.imatrix()
    char_para_mask = T.fmatrix()
    char_q_mask = T.fmatrix()

    true_p_len = T.iscalar()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]

    extra_rep_batch = T.concatenate(
        [extra.dot(extra_embeddings), extra],
        axis=2)  #(batch, p_len, extra_emb+extra_size)
    zero_pad = T.zeros((true_batch_size, 1, extra_emb + extra_size))
    left_context = T.concatenate([zero_pad, extra_rep_batch[:, :-1, :]],
                                 axis=1)  #(batch, p_len, extra_emb+extra_size)
    right_context = T.concatenate(
        [extra_rep_batch[:, 1:, :], zero_pad],
        axis=1)  #(batch, p_len, extra_emb+extra_size)

    left_context_2 = T.concatenate(
        [zero_pad, zero_pad, extra_rep_batch[:, :-2, :]],
        axis=1)  #(batch, p_len, extra_emb+extra_size)
    right_context_2 = T.concatenate(
        [extra_rep_batch[:, 2:, :], zero_pad, zero_pad],
        axis=1)  #(batch, p_len, extra_emb+extra_size)

    simi2left = T.sum(extra_rep_batch * left_context,
                      axis=2).dimshuffle(0, 1, 'x')  #(batch, p_len, 1)
    simi2right = T.sum(extra_rep_batch * right_context,
                       axis=2).dimshuffle(0, 1, 'x')  #(batch, p_len, 1)
    cos2left = cosine_tensor3(extra_rep_batch, left_context,
                              2).dimshuffle(0, 1, 'x')
    cos2right = cosine_tensor3(extra_rep_batch, right_context,
                               2).dimshuffle(0, 1, 'x')
    diff2left = extra_rep_batch - left_context
    diff2right = extra_rep_batch - right_context  #(batch, p_len, extra_emb+extra_size)
    extra_rep_batch = T.concatenate(
        [
            extra_rep_batch, left_context, right_context, left_context_2,
            right_context_2, diff2left, diff2right, simi2left, simi2right,
            cos2left, cos2right
        ],
        axis=2)  #batch, p_len, 7*(extra_emb+extra_size)+4)
    true_extra_size = 7 * (extra_emb + extra_size) + 4

    common_input_p = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, true_p_len,
         emb_size))  #the input format can be adapted into CNN or GRU or LSTM
    common_input_q = embeddings[questions.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size))

    char_common_input_p = char_embeddings[char_paragraph.flatten()].reshape(
        (true_batch_size * true_p_len, char_len, char_emb_size
         ))  #the input format can be adapted into CNN or GRU or LSTM
    char_common_input_q = char_embeddings[char_questions.flatten()].reshape(
        (true_batch_size * q_len_limit, char_len, char_emb_size))

    char_p_masks = char_para_mask.reshape(
        (true_batch_size * true_p_len, char_len))
    char_q_masks = char_q_mask.reshape(
        (true_batch_size * q_len_limit, char_len))

    conv_W_char, conv_b_char = create_conv_para(
        rng, filter_shape=(char_emb_size, 1, char_emb_size, char_filter_size))
    conv_W_1, conv_b_1 = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1,
                      emb_size + char_emb_size + true_extra_size,
                      filter_size[0]))
    conv_W_2, conv_b_2 = create_conv_para(rng,
                                          filter_shape=(hidden_size, 1,
                                                        hidden_size,
                                                        filter_size[1]))
    conv_W_3, conv_b_3 = create_conv_para(rng,
                                          filter_shape=(hidden_size, 1,
                                                        hidden_size,
                                                        filter_size[2]))
    #     conv_W_4, conv_b_4=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3]))
    #     conv_W_5, conv_b_5=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4]))

    conv_W_1_q, conv_b_1_q = create_conv_para(
        rng,
        filter_shape=(hidden_size, 1, emb_size + char_emb_size,
                      filter_size[0]))
    conv_W_2_q, conv_b_2_q = create_conv_para(rng,
                                              filter_shape=(hidden_size, 1,
                                                            hidden_size,
                                                            filter_size[1]))
    conv_W_3_q, conv_b_3_q = create_conv_para(rng,
                                              filter_shape=(hidden_size, 1,
                                                            hidden_size,
                                                            filter_size[2]))
    #     conv_W_4_q, conv_b_4_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[3]))
    #     conv_W_5_q, conv_b_5_q=create_conv_para(rng, filter_shape=(hidden_size, 1, hidden_size, filter_size[4]))
    CNN_para = [
        conv_W_1,
        conv_b_1,
        conv_W_2,
        conv_b_2,
        conv_W_1_q,
        conv_b_1_q,
        conv_W_2_q,
        conv_b_2_q,
        conv_W_3,
        conv_b_3,
        conv_W_3_q,
        conv_b_3_q,
        #              conv_W_4, conv_b_4, conv_W_5, conv_b_5,conv_W_4_q, conv_b_4_q, conv_W_5_q, conv_b_5_q,
        conv_W_char,
        conv_b_char
    ]

    span_input4score, word_input4score, overall_span_hidden_size, overall_word_hidden_size = squad_cnn_rank_spans_word(
        rng,
        common_input_p,
        common_input_q,
        char_common_input_p,
        char_common_input_q,
        batch_size,
        p_len_limit,
        q_len_limit,
        emb_size,
        char_emb_size,
        char_len,
        filter_size,
        char_filter_size,
        hidden_size,
        conv_W_1,
        conv_b_1,
        conv_W_2,
        conv_b_2,
        conv_W_1_q,
        conv_b_1_q,
        conv_W_2_q,
        conv_b_2_q,
        conv_W_char,
        conv_b_char,
        conv_W_3,
        conv_b_3,
        conv_W_3_q,
        conv_b_3_q,
        #                         conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q,
        #                         conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q,
        para_mask,
        q_mask,
        char_p_masks,
        char_q_masks,
        extra_rep_batch,
        true_extra_size)

    test_span_input4score, test_word_input4score, _, _ = squad_cnn_rank_spans_word(
        rng,
        common_input_p,
        common_input_q,
        char_common_input_p,
        char_common_input_q,
        test_batch_size,
        test_p_len_limit,
        q_len_limit,
        emb_size,
        char_emb_size,
        char_len,
        filter_size,
        char_filter_size,
        hidden_size,
        conv_W_1,
        conv_b_1,
        conv_W_2,
        conv_b_2,
        conv_W_1_q,
        conv_b_1_q,
        conv_W_2_q,
        conv_b_2_q,
        conv_W_char,
        conv_b_char,
        conv_W_3,
        conv_b_3,
        conv_W_3_q,
        conv_b_3_q,
        #                         conv_W_4, conv_b_4, conv_W_4_q, conv_b_4_q,
        #                         conv_W_5, conv_b_5, conv_W_5_q, conv_b_5_q,
        para_mask,
        q_mask,
        char_p_masks,
        char_q_masks,
        extra_rep_batch,
        true_extra_size)  #(batch, hidden, gram_size)

    gram_size = 5 * true_p_len - (0 + 1 + 2 + 3 + 4)

    #     U_a = create_ensemble_para(rng, 1, 4*hidden_size)
    #     norm_U_a=normalize_matrix(U_a)
    #     span_scores_matrix=T.dot(span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((batch_size, gram_size))  #(batch, 13*para_len-78, 1)

    span_HL_1_para = create_ensemble_para(rng, hidden_size,
                                          overall_span_hidden_size)
    span_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    span_U_a = create_ensemble_para(rng, 1,
                                    hidden_size + overall_span_hidden_size)
    norm_span_U_a = normalize_matrix(span_U_a)
    norm_span_HL_1_para = normalize_matrix(span_HL_1_para)
    norm_span_HL_2_para = normalize_matrix(span_HL_2_para)
    norm_span_HL_3_para = normalize_matrix(span_HL_3_para)
    norm_span_HL_4_para = normalize_matrix(span_HL_4_para)

    span_scores_matrix = add_HLs_2_tensor3(span_input4score,
                                           norm_span_HL_1_para,
                                           norm_span_HL_2_para,
                                           norm_span_HL_3_para,
                                           norm_span_HL_4_para, norm_span_U_a,
                                           batch_size, gram_size)

    span_scores = T.nnet.softmax(span_scores_matrix)  #(batch, 7*para_len-21)
    loss_neg_likelihood = -T.mean(
        T.log(span_scores[T.arange(batch_size), span_indices]))

    #ranking loss
    tanh_span_scores_matrix = span_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)

    index_matrix = T.zeros((batch_size, gram_size), dtype=theano.config.floatX)
    new_index_matrix = T.set_subtensor(
        index_matrix[T.arange(batch_size), span_indices], 1.0)

    prob_batch_posi = tanh_span_scores_matrix[new_index_matrix.nonzero()]
    prob_batch_nega = tanh_span_scores_matrix[(1.0 -
                                               new_index_matrix).nonzero()]

    repeat_posi = T.extra_ops.repeat(prob_batch_posi,
                                     prob_batch_nega.shape[0],
                                     axis=0)
    repeat_nega = T.extra_ops.repeat(prob_batch_nega.dimshuffle('x', 0),
                                     prob_batch_posi.shape[0],
                                     axis=0).flatten()
    loss_rank = T.mean(T.maximum(0.0, margin - repeat_posi + repeat_nega))

    span_loss = loss_neg_likelihood + loss_rank

    #     test_span_scores_matrix=T.dot(test_span_input4score.dimshuffle(0,2,1), norm_U_a).reshape((true_batch_size, gram_size))  #(batch, 13*para_len-78)
    test_span_scores_matrix = add_HLs_2_tensor3(
        test_span_input4score, norm_span_HL_1_para, norm_span_HL_2_para,
        norm_span_HL_3_para, norm_span_HL_4_para, norm_span_U_a,
        true_batch_size, gram_size)

    #word
    HL_1_para = create_ensemble_para(rng, hidden_size,
                                     overall_word_hidden_size)
    HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    start_U_a = create_ensemble_para(rng, 1,
                                     hidden_size + overall_word_hidden_size)
    norm_start_U_a = normalize_matrix(start_U_a)
    norm_HL_1_para = normalize_matrix(HL_1_para)
    norm_HL_2_para = normalize_matrix(HL_2_para)
    norm_HL_3_para = normalize_matrix(HL_3_para)
    norm_HL_4_para = normalize_matrix(HL_4_para)

    end_HL_1_para = create_ensemble_para(
        rng, hidden_size, overall_word_hidden_size + distance_emb)
    end_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    end_U_a = create_ensemble_para(
        rng, 1, hidden_size + overall_word_hidden_size + distance_emb)
    end_norm_U_a = normalize_matrix(end_U_a)
    end_norm_HL_1_para = normalize_matrix(end_HL_1_para)
    end_norm_HL_2_para = normalize_matrix(end_HL_2_para)
    end_norm_HL_3_para = normalize_matrix(end_HL_3_para)
    end_norm_HL_4_para = normalize_matrix(end_HL_4_para)

    start_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_HL_1_para,
                                            norm_HL_2_para, norm_HL_3_para,
                                            norm_HL_4_para, norm_start_U_a,
                                            batch_size, true_p_len)
    start_scores = T.nnet.softmax(start_scores_matrix)  #(batch, para_len)
    '''
    forward start info to end prediction
    '''
    distance_matrix = word_indices[:, 0].dimshuffle(
        0, 'x') - T.arange(true_p_len).dimshuffle('x', 0)  #(batch, p_len)
    distance_trunc_matrix = T.maximum(
        -distance, T.minimum(distance,
                             distance_matrix)) + distance  #(batch, p_len)
    zero_distance_matrix = T.zeros(
        (true_batch_size * true_p_len, 2 * distance + 1))
    filled_distance_matrix = T.set_subtensor(
        zero_distance_matrix[T.arange(true_batch_size * true_p_len),
                             distance_trunc_matrix.flatten()], 1.0)
    filled_distance_tensor3 = filled_distance_matrix.reshape(
        (true_batch_size,
         true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle(
             0, 2, 1)  #(batch_size, distance_emb, p_len)

    end_word_input4score = T.concatenate(
        [word_input4score, filled_distance_tensor3],
        axis=1)  #(batch, +distance_emb, p_len)

    end_scores_matrix = add_HLs_2_tensor3(end_word_input4score,
                                          end_norm_HL_1_para,
                                          end_norm_HL_2_para,
                                          end_norm_HL_3_para,
                                          end_norm_HL_4_para, end_norm_U_a,
                                          batch_size, true_p_len)
    end_scores = T.nnet.softmax(end_scores_matrix)  #(batch, para_len)
    start_loss_neg_likelihood = -T.mean(
        T.log(start_scores[T.arange(batch_size), word_indices[:, 0]]))
    end_loss_neg_likelihood = -T.mean(
        T.log(end_scores[T.arange(batch_size), word_indices[:, 1]]))

    #ranking loss start
    tanh_start_scores_matrix = start_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    start_index_matrix = T.zeros((batch_size, p_len_limit),
                                 dtype=theano.config.floatX)
    start_new_index_matrix = T.set_subtensor(
        start_index_matrix[T.arange(batch_size), word_indices[:, 0]], 1.0)
    start_prob_batch_posi = tanh_start_scores_matrix[
        start_new_index_matrix.nonzero()]
    start_prob_batch_nega = tanh_start_scores_matrix[(
        1.0 - start_new_index_matrix).nonzero()]
    start_repeat_posi = T.extra_ops.repeat(start_prob_batch_posi,
                                           start_prob_batch_nega.shape[0],
                                           axis=0)
    start_repeat_nega = T.extra_ops.repeat(start_prob_batch_nega.dimshuffle(
        'x', 0),
                                           start_prob_batch_posi.shape[0],
                                           axis=0).flatten()
    start_loss_rank = T.mean(
        T.maximum(0.0, margin - start_repeat_posi + start_repeat_nega))

    #ranking loss END
    end_tanh_scores_matrix = end_scores  #T.tanh(span_scores_matrix) #(batch, gram_size)
    end_index_matrix = T.zeros((batch_size, p_len_limit),
                               dtype=theano.config.floatX)
    end_new_index_matrix = T.set_subtensor(
        end_index_matrix[T.arange(batch_size), word_indices[:, 1]], 1.0)
    end_prob_batch_posi = end_tanh_scores_matrix[
        end_new_index_matrix.nonzero()]
    end_prob_batch_nega = end_tanh_scores_matrix[(
        1.0 - end_new_index_matrix).nonzero()]
    end_repeat_posi = T.extra_ops.repeat(end_prob_batch_posi,
                                         end_prob_batch_nega.shape[0],
                                         axis=0)
    end_repeat_nega = T.extra_ops.repeat(end_prob_batch_nega.dimshuffle(
        'x', 0),
                                         end_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    end_loss_rank = T.mean(
        T.maximum(0.0, margin - end_repeat_posi + end_repeat_nega))

    word_loss = start_loss_neg_likelihood + end_loss_neg_likelihood + start_loss_rank + end_loss_rank

    #test
    test_start_scores_matrix = add_HLs_2_tensor3(
        test_word_input4score, norm_HL_1_para, norm_HL_2_para, norm_HL_3_para,
        norm_HL_4_para, norm_start_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    mask_test_start_return = test_start_scores_matrix * para_mask  #(batch, p_len)
    '''
    forward start info to end prediction in testing
    '''
    test_distance_matrix = T.argmax(mask_test_start_return, axis=1).dimshuffle(
        0, 'x') - T.arange(true_p_len).dimshuffle('x', 0)  #(batch, p_len)
    test_distance_trunc_matrix = T.maximum(
        -distance, T.minimum(distance,
                             test_distance_matrix)) + distance  #(batch, p_len)
    test_zero_distance_matrix = T.zeros(
        (true_batch_size * true_p_len, 2 * distance + 1))
    test_filled_distance_matrix = T.set_subtensor(
        test_zero_distance_matrix[T.arange(true_batch_size * true_p_len),
                                  test_distance_trunc_matrix.flatten()], 1.0)
    test_filled_distance_tensor3 = test_filled_distance_matrix.reshape(
        (true_batch_size,
         true_p_len, 2 * distance + 1)).dot(distance_embeddings).dimshuffle(
             0, 2, 1)  #(batch_size, distance_emb, p_len)
    test_end_word_input4score = T.concatenate(
        [test_word_input4score, test_filled_distance_tensor3],
        axis=1)  #(batch, +distance-emb, p_len)

    end_test_scores_matrix = add_HLs_2_tensor3(
        test_end_word_input4score, end_norm_HL_1_para, end_norm_HL_2_para,
        end_norm_HL_3_para, end_norm_HL_4_para, end_norm_U_a, true_batch_size,
        true_p_len)  #(batch, test_p_len)
    end_mask_test_return = end_test_scores_matrix * para_mask  #(batch, p_len)

    word_gram_1 = mask_test_start_return + end_mask_test_return
    word_gram_2 = mask_test_start_return[:, :
                                         -1] + end_mask_test_return[:,
                                                                    1:]  #(batch* hidden_size, maxsenlen-1)
    word_gram_3 = mask_test_start_return[:, :
                                         -2] + end_mask_test_return[:,
                                                                    2:]  #(batch* hidden_size, maxsenlen-2)
    word_gram_4 = mask_test_start_return[:, :
                                         -3] + end_mask_test_return[:,
                                                                    3:]  #(batch* hidden_size, maxsenlen-3)
    word_gram_5 = mask_test_start_return[:, :
                                         -4] + end_mask_test_return[:,
                                                                    4:]  #(batch* hidden_size, maxsenlen-4)
    word_pair_scores = T.concatenate(
        [word_gram_1, word_gram_2, word_gram_3, word_gram_4, word_gram_5],
        axis=1)  #(batch_size, gram_size)

    #ans words train
    ans_HL_1_para = create_ensemble_para(rng, hidden_size,
                                         overall_word_hidden_size)
    ans_HL_2_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_HL_3_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_HL_4_para = create_ensemble_para(rng, hidden_size, hidden_size)
    ans_U_a = create_ensemble_para(rng, 1,
                                   hidden_size + overall_word_hidden_size)
    norm_ans_U_a = normalize_matrix(ans_U_a)
    norm_ans_HL_1_para = normalize_matrix(ans_HL_1_para)
    norm_ans_HL_2_para = normalize_matrix(ans_HL_2_para)
    norm_ans_HL_3_para = normalize_matrix(ans_HL_3_para)
    norm_ans_HL_4_para = normalize_matrix(ans_HL_4_para)

    ans_scores_matrix = add_HLs_2_tensor3(word_input4score, norm_ans_HL_1_para,
                                          norm_ans_HL_2_para,
                                          norm_ans_HL_3_para,
                                          norm_ans_HL_4_para, norm_ans_U_a,
                                          batch_size, true_p_len)
    ans_scores_vec = T.nnet.softmax(
        ans_scores_matrix).flatten()  #(batch, para_len)
    ans_loss_neg_likelihood = -T.mean(T.log(ans_scores_vec[ans_indices]))

    ans_index_vec = T.zeros((batch_size, p_len_limit),
                            dtype=theano.config.floatX).flatten()
    ans_new_index = T.set_subtensor(ans_index_vec[ans_indices], 1.0)
    ans_prob_batch_posi = ans_scores_vec[ans_new_index.nonzero()]
    ans_prob_batch_nega = ans_scores_vec[(1.0 - ans_new_index).nonzero()]
    ans_repeat_posi = T.extra_ops.repeat(ans_prob_batch_posi,
                                         ans_prob_batch_nega.shape[0],
                                         axis=0)
    ans_repeat_nega = T.extra_ops.repeat(ans_prob_batch_nega.dimshuffle(
        'x', 0),
                                         ans_prob_batch_posi.shape[0],
                                         axis=0).flatten()
    ans_loss_rank = T.mean(
        T.maximum(0.0, margin - ans_repeat_posi + ans_repeat_nega))

    ans_loss = ans_loss_neg_likelihood + ans_loss_rank
    #ans words test
    test_ans_scores_matrix = add_HLs_2_tensor3(
        test_word_input4score, norm_ans_HL_1_para, norm_ans_HL_2_para,
        norm_ans_HL_3_para, norm_ans_HL_4_para, norm_ans_U_a, true_batch_size,
        true_p_len)
    test_ans_scores_matrix = test_ans_scores_matrix * para_mask  #T.nnet.softmax(test_ans_scores_matrix) #(batch, para_len)
    ans_gram_1 = test_ans_scores_matrix
    ans_gram_2 = (test_ans_scores_matrix[:, :-1] +
                  test_ans_scores_matrix[:, 1:]
                  ) / 2.0  #(batch* hidden_size, maxsenlen-1)
    ans_gram_3 = (test_ans_scores_matrix[:, :-2] +
                  test_ans_scores_matrix[:, 1:-1] +
                  test_ans_scores_matrix[:, 2:]
                  ) / 3.0  #(batch* hidden_size, maxsenlen-2)
    ans_gram_4 = (
        test_ans_scores_matrix[:, :-3] + test_ans_scores_matrix[:, 1:-2] +
        test_ans_scores_matrix[:, 2:-1] + test_ans_scores_matrix[:, 3:]
    ) / 4.0  #(batch* hidden_size, maxsenlen-3)
    ans_gram_5 = (
        test_ans_scores_matrix[:, :-4] + test_ans_scores_matrix[:, 1:-3] +
        test_ans_scores_matrix[:, 2:-2] + test_ans_scores_matrix[:, 3:-1] +
        test_ans_scores_matrix[:,
                               4:]) / 5.0  #(batch* hidden_size, maxsenlen-4)
    ans_word_scores = T.concatenate(
        [ans_gram_1, ans_gram_2, ans_gram_3, ans_gram_4, ans_gram_5],
        axis=1)  #(batch, hidden_size, maxsenlen-(0+1+2+3+4))
    '''
    form test spans and masks
    '''
    test_span_word_scores_matrix = word_pair_scores + ans_word_scores  #test_span_scores_matrix+
    test_spans_mask_1 = para_mask
    test_spans_mask_2 = para_mask[:, :
                                  -1] * para_mask[:,
                                                  1:]  #(batch* hidden_size, maxsenlen-1)
    test_spans_mask_3 = para_mask[:, :
                                  -2] * para_mask[:, 1:
                                                  -1] * para_mask[:,
                                                                  2:]  #(batch* hidden_size, maxsenlen-2)
    test_spans_mask_4 = para_mask[:, :
                                  -3] * para_mask[:, 1:
                                                  -2] * para_mask[:, 2:
                                                                  -1] * para_mask[:,
                                                                                  3:]  #(batch* hidden_size, maxsenlen-3)
    test_spans_mask_5 = para_mask[:, :
                                  -4] * para_mask[:, 1:
                                                  -3] * para_mask[:, 2:
                                                                  -2] * para_mask[:,
                                                                                  3:
                                                                                  -1] * para_mask[:,
                                                                                                  4:]
    test_spans_mask = T.concatenate([
        test_spans_mask_1, test_spans_mask_2, test_spans_mask_3,
        test_spans_mask_4, test_spans_mask_5
    ],
                                    axis=1)  #(batch, 5*p_len -)
    #     test_return=T.argmax(test_span_word_scores_matrix, axis=1) #batch T.argmax(test_span_word_scores_matrix*test_spans_mask, axis=1) #batch
    test_return = T.argmax(test_span_word_scores_matrix * test_spans_mask,
                           axis=1)  #batch

    #     params = [embeddings,char_embeddings]+NN_para+[U_a]
    params = (
        [embeddings, char_embeddings, extra_embeddings, distance_embeddings] +
        CNN_para
        #               +[span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para]
        + [start_U_a, HL_1_para, HL_2_para, HL_3_para, HL_4_para] +
        [end_U_a, end_HL_1_para, end_HL_2_para, end_HL_3_para, end_HL_4_para] +
        [ans_U_a, ans_HL_1_para, ans_HL_2_para, ans_HL_3_para, ans_HL_4_para])

    L2_reg = L2norm_paraList([
        embeddings,
        char_embeddings,
        extra_embeddings,
        distance_embeddings,
        conv_W_1,
        conv_W_2,
        conv_W_1_q,
        conv_W_2_q,
        conv_W_char,
        conv_W_3,
        conv_W_3_q,
        #     conv_W_4, conv_W_5,conv_W_4_q, conv_W_5_q,
        #     span_U_a,span_HL_1_para,span_HL_2_para,span_HL_3_para,span_HL_4_para,
        start_U_a,
        HL_1_para,
        HL_2_para,
        HL_3_para,
        HL_4_para,
        end_U_a,
        end_HL_1_para,
        end_HL_2_para,
        end_HL_3_para,
        end_HL_4_para,
        ans_U_a,
        ans_HL_1_para,
        ans_HL_2_para,
        ans_HL_3_para,
        ans_HL_4_para
    ])
    #L2_reg = L2norm_paraList(params)
    cost = word_loss + ans_loss + L2_weight * L2_reg  #span_loss+

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([
        paragraph, questions, span_indices, word_indices, ans_indices,
        para_mask, q_mask, extra, char_paragraph, char_questions,
        char_para_mask, char_q_mask, true_p_len
    ],
                                  cost,
                                  updates=updates,
                                  on_unused_input='ignore')

    test_model = theano.function([
        paragraph, questions, para_mask, q_mask, extra, char_paragraph,
        char_questions, char_para_mask, char_q_mask, true_p_len
    ],
                                 test_return,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / test_batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(numpy.arange(n_test_batches) *
                            test_batch_size) + [test_size - test_batch_size]

    max_F1_acc = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.Random(200).shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_ids[para_id:para_id + batch_size]
            boundary_labels_batch = train_word_label_list[train_id_batch]
            ans_label_list = []
            for i in range(batch_size):
                start = boundary_labels_batch[i][0] + i * p_len_limit
                end = boundary_labels_batch[i][1] + i * p_len_limit
                ans_label_list += range(start, end + 1)
            ans_label_list = numpy.asarray(ans_label_list, dtype='int32')

            cost_i += train_model(
                train_para_list[train_id_batch], train_Q_list[train_id_batch],
                train_span_label_list[train_id_batch], boundary_labels_batch,
                ans_label_list, train_para_mask[train_id_batch],
                train_Q_mask[train_id_batch],
                train_para_extras[train_id_batch],
                train_para_char_list[train_id_batch],
                train_Q_char_list[train_id_batch],
                train_para_char_mask[train_id_batch],
                train_Q_char_mask[train_id_batch], p_len_limit)

            #print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict = {}
                q_amount = 0
                for test_para_id in test_batch_start:
                    batch_predict_ids = test_model(
                        test_para_list[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_list[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_mask[test_para_id:test_para_id +
                                       test_batch_size],
                        test_Q_mask[test_para_id:test_para_id +
                                    test_batch_size],
                        test_para_extras[test_para_id:test_para_id +
                                         test_batch_size],
                        test_para_char_list[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_list[test_para_id:test_para_id +
                                         test_batch_size],
                        test_para_char_mask[test_para_id:test_para_id +
                                            test_batch_size],
                        test_Q_char_mask[test_para_id:test_para_id +
                                         test_batch_size], test_p_len_limit)
                    test_para_wordlist_batch = test_para_wordlist_list[
                        test_para_id:test_para_id + test_batch_size]
                    q_ids_batch = q_idlist[test_para_id:test_para_id +
                                           test_batch_size]
                    q_amount += test_batch_size

                    for q in range(test_batch_size):  #for each question
                        pred_ans = decode_predict_id(
                            batch_predict_ids[q], test_para_wordlist_batch[q])
                        q_id = q_ids_batch[q]
                        pred_dict[q_id] = pred_ans
#                         print q_id, test_para_wordlist_batch[q],'\t',pred_ans
                with codecs.open(rootPath + 'predictions.txt', 'w',
                                 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath + 'dev-v1.1.json',
                                                  rootPath + 'predictions.txt')
                if F1_acc > max_F1_acc:
                    max_F1_acc = F1_acc
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc


#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
    return max_exact_acc
Example #8
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=2000, batch_size=500, test_batch_size=500, emb_size=300, hidden_size=300,
                    L2_weight=0.0001, margin=0.5,
                    train_size=4000000, test_size=1000, 
                    max_context_len=25, max_span_len=7, max_q_len=40, max_EM=0.052):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = np.random.RandomState(23455)
    word2id,train_questions,train_questions_mask,train_lefts,train_lefts_mask,train_spans,train_spans_mask,train_rights,train_rights_mask=load_SQUAD_hinrich(train_size, max_context_len, max_span_len, max_q_len)



    test_ground_truth,test_candidates,test_questions,test_questions_mask,test_lefts,test_lefts_mask,test_spans,test_spans_mask,test_rights,test_rights_mask=load_dev_hinrich(word2id, test_size, max_context_len, max_span_len, max_q_len)
    
    
    
    

    overall_vocab_size=len(word2id)
    print 'vocab size:', overall_vocab_size


    rand_values=random_value_normal((overall_vocab_size+1, emb_size), theano.config.floatX, np.random.RandomState(1234))
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()

    left=T.imatrix()  #(2*batch, len)
    left_mask=T.fmatrix() #(2*batch, len)
    span=T.imatrix()  #(2*batch, span_len)
    span_mask=T.fmatrix() #(2*batch, span_len)
    right=T.imatrix()  #(2*batch, len)
    right_mask=T.fmatrix() #(2*batch, len)
    q=T.imatrix()  #(2*batch, len_q)
    q_mask=T.fmatrix() #(2*batch, len_q)





    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    GRU1_para=[U1, W1, b1, U1_b, W1_b, b1_b]
    
    U2, W2, b2=create_GRU_para(rng, hidden_size, hidden_size)
    U2_b, W2_b, b2_b=create_GRU_para(rng, hidden_size, hidden_size)
    GRU2_para=[U2, W2, b2, U2_b, W2_b, b2_b]
    
    W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)

    attend_para=[W_a1, W_a2]
    params = [embeddings]+GRU1_para+attend_para+GRU2_para
#     load_model_from_file(rootPath+'Best_Para_dim'+str(emb_size), params)

    left_input = embeddings[left.flatten()].reshape((left.shape[0], left.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context)
    span_input = embeddings[span.flatten()].reshape((span.shape[0], span.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_span)
    right_input = embeddings[right.flatten()].reshape((right.shape[0], right.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_context)
    q_input = embeddings[q.flatten()].reshape((q.shape[0], q.shape[1], emb_size)).transpose((0, 2,1)) # (2*batch_size, emb_size, len_q)


    left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=left_input, Mask=left_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    left_reps=left_model.output_tensor #(batch, emb, para_len)

    span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=span_input, Mask=span_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    span_reps=span_model.output_tensor #(batch, emb, para_len)

    right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=right_input, Mask=right_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    right_reps=right_model.output_tensor #(batch, emb, para_len)

    q_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=q_input, Mask=q_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    q_reps=q_model.output_tensor #(batch, emb, para_len)

    #interaction
    left_reps_via_q_reps, q_reps_via_left_reps=attention_dot_prod_between_2tensors(left_reps, q_reps)
    span_reps_via_q_reps, q_reps_via_span_reps=attention_dot_prod_between_2tensors(span_reps, q_reps)
    right_reps_via_q_reps, q_reps_via_right_reps=attention_dot_prod_between_2tensors(right_reps, q_reps)

#     q_reps_via_left_reps=attention_dot_prod_between_2tensors(q_reps, left_reps)
#     q_reps_via_span_reps=attention_dot_prod_between_2tensors(q_reps, span_reps)
#     q_reps_via_right_reps=attention_dot_prod_between_2tensors(q_reps, right_reps)

    #combine


    origin_W=normalize_matrix(W_a1)
    attend_W=normalize_matrix(W_a2)

    left_origin_reps=T.dot(left_reps.dimshuffle(0, 2,1), origin_W)
    span_origin_reps=T.dot(span_reps.dimshuffle(0, 2,1), origin_W)
    right_origin_reps=T.dot(right_reps.dimshuffle(0, 2,1), origin_W)
    q_origin_reps=T.dot(q_reps.dimshuffle(0, 2,1), origin_W)

    left_attend_q_reps=T.dot(q_reps_via_left_reps.dimshuffle(0, 2,1), attend_W)
    span_attend_q_reps=T.dot(q_reps_via_span_reps.dimshuffle(0, 2,1), attend_W)
    right_attend_q_reps=T.dot(q_reps_via_right_reps.dimshuffle(0, 2,1), attend_W)

    q_attend_left_reps=T.dot(left_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)
    q_attend_span_reps=T.dot(span_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)
    q_attend_right_reps=T.dot(right_reps_via_q_reps.dimshuffle(0, 2,1), attend_W)


    add_left=left_origin_reps+q_attend_left_reps  #(2*batch, len ,hidden)
    add_span=span_origin_reps+q_attend_span_reps
    add_right=right_origin_reps+q_attend_right_reps

    add_q_by_left=q_origin_reps+left_attend_q_reps
    add_q_by_span=q_origin_reps+span_attend_q_reps
    add_q_by_right=q_origin_reps+right_attend_q_reps

    #second GRU


    add_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_left.dimshuffle(0,2,1), Mask=left_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_left_reps=add_left_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_span.dimshuffle(0,2,1), Mask=span_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_span_reps=add_span_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_right.dimshuffle(0,2,1), Mask=right_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_right_reps=add_right_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_left_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_left.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_left_reps=add_q_by_left_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_span_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_span.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_span_reps=add_q_by_span_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    add_q_by_right_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=add_q_by_right.dimshuffle(0,2,1), Mask=q_mask, hidden_dim=hidden_size,U=U2,W=W2,b=b2,Ub=U2_b,Wb=W2_b,bb=b2_b)
    add_q_by_right_reps=add_q_by_right_model.output_sent_rep_maxpooling #(batch, hidden_dim)

    paragraph_concat=T.concatenate([add_left_reps, add_span_reps, add_right_reps], axis=1) #(batch, 3*hidden)
    question_concat=T.concatenate([add_q_by_left_reps, add_q_by_span_reps, add_q_by_right_reps], axis=1)   #(batch, 3*hidden)

    simi_list=cosine_row_wise_twoMatrix(paragraph_concat, question_concat)  #(2*batch)

    pos_simi_vec=simi_list[::2]
    neg_simi_vec=simi_list[1::2]

    raw_loss=T.maximum(0.0, margin+neg_simi_vec-pos_simi_vec)



    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=T.sum(raw_loss)#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))



    train_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([left, left_mask, span, span_mask, right, right_mask, q, q_mask], simi_list, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size    #batch_size means how many pairs
    remain_train=train_size%batch_size
#     train_batch_start=list(np.arange(n_train_batches)*batch_size*2)+[train_size*2-batch_size*2] # always ou shu
    if remain_train>0:
        train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 
    else:
        train_batch_start=list(np.arange(n_train_batches)*batch_size)




    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_odd_ids = list(np.arange(train_size)*2)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_odd_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_list=[[train_odd_id, train_odd_id+1] for train_odd_id in train_odd_ids[para_id:para_id+batch_size]]
            train_id_list=sum(train_id_list,[])
#             print train_id_list
            cost_i+= train_model(
                                np.asarray([train_lefts[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_lefts_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_spans[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_spans_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_rights[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_rights_mask[id] for id in train_id_list], dtype=theano.config.floatX),
                                np.asarray([train_questions[id] for id in train_id_list], dtype='int32'),
                                np.asarray([train_questions_mask[id] for id in train_id_list], dtype=theano.config.floatX))

            #print iter
            if iter%100==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match=0.0
                F1_match=0.0


                for test_pair_id in range(test_size):
                    test_example_lefts=test_lefts[test_pair_id]
                    test_example_lefts_mask=test_lefts_mask[test_pair_id]
                    test_example_spans=test_spans[test_pair_id]
                    test_example_spans_mask=test_spans_mask[test_pair_id]
                    test_example_rights=test_rights[test_pair_id]
                    test_example_rights_mask=test_rights_mask[test_pair_id]
                    test_example_questions=test_questions[test_pair_id]
                    test_example_questions_mask=test_questions_mask[test_pair_id]       
                    test_example_candidates=test_candidates[test_pair_id]
                    
                    
                    
                    test_example_size=len(test_example_lefts)
#                     print 'test_pair_id, test_example_size:', test_pair_id, test_example_size
                    if test_example_size < test_batch_size:
                        #pad
                        pad_size=test_batch_size-test_example_size
                        test_example_lefts+=test_example_lefts[-1:]*pad_size
                        test_example_lefts_mask+=test_example_lefts_mask[-1:]*pad_size
                        test_example_spans+=test_example_spans[-1:]*pad_size
                        test_example_spans_mask+=test_example_spans_mask[-1:]*pad_size
                        test_example_rights+=test_example_rights[-1:]*pad_size
                        test_example_rights_mask+=test_example_rights_mask[-1:]*pad_size
                        test_example_questions+=test_example_questions[-1:]*pad_size
                        test_example_questions_mask+=test_example_questions_mask[-1:]*pad_size 
                        test_example_candidates+=test_example_candidates[-1:]*pad_size
                        
                        test_example_size=test_batch_size
                    
                                            
                    n_test_batches=test_example_size/test_batch_size
                    n_test_remain=test_example_size%test_batch_size
                    if n_test_remain > 0:
                        test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_example_size-test_batch_size]
                    else:
                        test_batch_start=list(np.arange(n_test_batches)*test_batch_size)
                    all_simi_list=[]
                    all_cand_list=[]
                    for test_para_id in test_batch_start:
                        simi_return_vector=test_model(
                                    np.asarray(test_example_lefts[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_lefts_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_spans[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_spans_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_rights[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_rights_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX),
                                    np.asarray(test_example_questions[test_para_id:test_para_id+test_batch_size], dtype='int32'),
                                    np.asarray(test_example_questions_mask[test_para_id:test_para_id+test_batch_size], dtype=theano.config.floatX))
                        candidate_list=test_example_candidates[test_para_id:test_para_id+test_batch_size]
                        all_simi_list+=list(simi_return_vector)
                        all_cand_list+=candidate_list
                    top1_cand=all_cand_list[np.argsort(all_simi_list)[-1]]
#                     print top1_cand, test_ground_truth[test_pair_id]

                    if top1_cand == test_ground_truth[test_pair_id]:
                        exact_match+=1
                    F1=macrof1(top1_cand, test_ground_truth[test_pair_id])
#                     print '\t\t\t', F1
                    F1_match+=F1
#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                F1_acc=F1_match/test_size
                exact_acc=exact_match/test_size
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
#                     store_model_to_file(params, emb_size)
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
                    if max_exact_acc > max_EM:
                        store_model_to_file(rootPath+'Best_Para_'+str(max_EM), params)
                        print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc




            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.01, n_epochs=2000, batch_size=200, test_batch_size=200, emb_size=300, hidden_size=20,
                    L2_weight=0.0001, p_len_limit=400, test_p_len_limit=40, q_len_limit=20, filter_size = [5,5], margin=0.85, max_EM=50.302743615):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/SQuAD/';
    rng = numpy.random.RandomState(23455)
    

    word2id={}
    #questions,paragraphs,q_masks,p_masks,labels, word2id
    train_Q_list,train_para_list, train_Q_mask, train_para_mask, train_label_list, word2id=load_squad_cnn_rank_span_train(word2id, p_len_limit, q_len_limit)
    train_size=len(train_para_list)
    test_Q_list, test_para_list,  test_Q_mask, test_para_mask, q_idlist, word2id, test_para_wordlist_list= load_squad_cnn_rank_span_dev(word2id, test_p_len_limit, q_len_limit)
    test_size=len(test_para_list)
    train_Q_list = numpy.asarray(train_Q_list, dtype='int32')
    train_para_list = numpy.asarray(train_para_list, dtype='int32')
    train_Q_mask = numpy.asarray(train_Q_mask, dtype=theano.config.floatX)
    train_para_mask = numpy.asarray(train_para_mask, dtype=theano.config.floatX)
    train_label_list = numpy.asarray(train_label_list, dtype='int32')

    test_Q_list = numpy.asarray(test_Q_list, dtype='int32')
    test_para_list = numpy.asarray(test_para_list, dtype='int32')
    test_Q_mask = numpy.asarray(test_Q_mask, dtype=theano.config.floatX)
    test_para_mask = numpy.asarray(test_para_mask, dtype=theano.config.floatX)
    
    

    vocab_size = len(word2id)
    print 'vocab size: ', vocab_size
    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_glove()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)


    # allocate symbolic variables for the data
#     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    gold_indices= T.ivector() #batch
    para_mask=T.fmatrix('para_mask')
    q_mask=T.fmatrix('q_mask')

#     paragraph = T.imatrix('paragraph')
#     questions = T.imatrix('questions')
#     gold_indices= T.ivector() #batch
#     para_mask=T.fmatrix('para_mask')
#     q_mask=T.fmatrix('q_mask')
#     extraF=T.ftensor3('extraF') # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size=paragraph.shape[0]

#     norm_extraF=normalize_matrix(extraF)

    U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b=create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para=[U1, W1, b1, U1_b, W1_b, b1_b]

    U_e1, W_e1, b_e1=create_GRU_para(rng, 3*hidden_size, hidden_size)
    U_e1_b, W_e1_b, b_e1_b=create_GRU_para(rng, 3*hidden_size, hidden_size)
    paragraph_para_e1=[U_e1, W_e1, b_e1, U_e1_b, W_e1_b, b_e1_b]


    UQ, WQ, bQ=create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b=create_GRU_para(rng, emb_size, hidden_size)
    Q_para=[UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]

#     W_a1 = create_ensemble_para(rng, hidden_size, hidden_size)# init_weights((2*hidden_size, hidden_size))
#     W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 1, 2*hidden_size) # 3 extra features
#     LR_b = theano.shared(value=numpy.zeros((2,),
#                                                  dtype=theano.config.floatX),  # @UndefinedVariable
#                                name='LR_b', borrow=True)

    HL_paras=[U_a]
    params = [embeddings]+paragraph_para+Q_para+paragraph_para_e1+HL_paras

#     load_model_from_file(rootPath+'Best_Paras_conv_50.302743614', params)


    paragraph_input = embeddings[paragraph.flatten()].reshape((true_batch_size, paragraph.shape[1], emb_size)).transpose((0, 2,1)) # (batch_size, emb_size, maxparalen)
#     concate_paragraph_input=T.concatenate([paragraph_input, norm_extraF.dimshuffle((0,2,1))], axis=1)


    paragraph_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=paragraph_input, Mask=para_mask, hidden_dim=hidden_size,U=U1,W=W1,b=b1,Ub=U1_b,Wb=W1_b,bb=b1_b)
    para_reps=paragraph_model.output_tensor #(batch, emb, para_len)

#     #LSTM
#     fwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
#     paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
#     para_reps=paragraph_model.output_tensor

    Qs_emb = embeddings[questions.flatten()].reshape((true_batch_size, questions.shape[1], emb_size)).transpose((0, 2,1)) #(#questions, emb_size, maxsenlength)

    questions_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=Qs_emb, Mask=q_mask, hidden_dim=hidden_size, U=UQ,W=WQ,b=bQ, Ub=UQ_b, Wb=WQ_b, bb=bQ_b)
    questions_reps_tensor=questions_model.output_tensor
    questions_reps=questions_model.output_sent_rep_maxpooling.reshape((true_batch_size, 1, hidden_size)) #(batch, 1, hidden)
    questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)  #(batch, para_len, hidden)

#     #LSTM for questions
#     fwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     bwd_LSTM_q_dict=create_LSTM_para(rng, emb_size, hidden_size)
#     Q_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
#     questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask(Qs_emb, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
#     questions_reps_tensor=questions_model.output_tensor





#     new_labels=T.gt(labels[:,:-1]+labels[:,1:], 0.0)
#     ConvGRU_1=Conv_then_GRU_then_Classify(rng, concate_paragraph_input, Qs_emb, para_len_limit, q_len_limit, emb_size+3, hidden_size, emb_size, 2, batch_size, para_mask, q_mask, new_labels, 2)
#     ConvGRU_1_dis=ConvGRU_1.masked_dis_inprediction
#     padding_vec = T.zeros((batch_size, 1), dtype=theano.config.floatX)
#     ConvGRU_1_dis_leftpad=T.concatenate([padding_vec, ConvGRU_1_dis], axis=1)
#     ConvGRU_1_dis_rightpad=T.concatenate([ConvGRU_1_dis, padding_vec], axis=1)
#     ConvGRU_1_dis_into_unigram=0.5*(ConvGRU_1_dis_leftpad+ConvGRU_1_dis_rightpad)


    #
    def example_in_batch(para_matrix, q_matrix):
        #assume both are (hidden, len)
        transpose_para_matrix=para_matrix.T
        interaction_matrix=T.dot(transpose_para_matrix, q_matrix) #(para_len, q_len)
        norm_interaction_matrix=T.nnet.softmax(interaction_matrix)
#         norm_interaction_matrix=T.maximum(0.0, interaction_matrix)
        return T.dot(q_matrix, norm_interaction_matrix.T)/T.sum(norm_interaction_matrix.T, axis=0).dimshuffle('x',0) #(len, para_len)
    batch_q_reps, updates = theano.scan(fn=example_in_batch,
                                   outputs_info=None,
                                   sequences=[para_reps, questions_reps_tensor])    #batch_q_reps (batch, hidden, para_len)



    #para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1), all are in (batch, hidden , para_len)
    ensemble_para_reps_tensor=T.concatenate([para_reps, batch_q_reps, questions_reps.dimshuffle(0,2,1)], axis=1) #(batch, 3*hidden+3, para_len)
    para_ensemble_model=Bd_GRU_Batch_Tensor_Input_with_Mask(X=ensemble_para_reps_tensor, Mask=para_mask, hidden_dim=hidden_size,U=U_e1,W=W_e1,b=b_e1,Ub=U_e1_b,Wb=W_e1_b,bb=b_e1_b)
    para_reps_tensor4score=para_ensemble_model.output_tensor #(batch, hidden ,para_len)
    #for span reps
    span_1=T.concatenate([para_reps_tensor4score, para_reps_tensor4score], axis=1) #(batch, 2*hidden ,para_len)
    span_2=T.concatenate([para_reps_tensor4score[:,:,:-1], para_reps_tensor4score[:,:,1:]], axis=1) #(batch, 2*hidden ,para_len-1)
    span_3=T.concatenate([para_reps_tensor4score[:,:,:-2], para_reps_tensor4score[:,:,2:]], axis=1) #(batch, 2*hidden ,para_len-2)
    span_4=T.concatenate([para_reps_tensor4score[:,:,:-3], para_reps_tensor4score[:,:,3:]], axis=1) #(batch, 2*hidden ,para_len-3)
    span_5=T.concatenate([para_reps_tensor4score[:,:,:-4], para_reps_tensor4score[:,:,4:]], axis=1) #(batch, 2*hidden ,para_len-4)
    span_6=T.concatenate([para_reps_tensor4score[:,:,:-5], para_reps_tensor4score[:,:,5:]], axis=1) #(batch, 2*hidden ,para_len-5)
    span_7=T.concatenate([para_reps_tensor4score[:,:,:-6], para_reps_tensor4score[:,:,6:]], axis=1) #(batch, 2*hidden ,para_len-6)

    span_8=T.concatenate([para_reps_tensor4score[:,:,:-7], para_reps_tensor4score[:,:,7:]], axis=1) #(batch, 2*hidden ,para_len-7)
    span_9=T.concatenate([para_reps_tensor4score[:,:,:-8], para_reps_tensor4score[:,:,8:]], axis=1) #(batch, 2*hidden ,para_len-8)
    span_10=T.concatenate([para_reps_tensor4score[:,:,:-9], para_reps_tensor4score[:,:,9:]], axis=1) #(batch, 2*hidden ,para_len-9)
    span_11=T.concatenate([para_reps_tensor4score[:,:,:-10], para_reps_tensor4score[:,:,10:]], axis=1) #(batch, 2*hidden ,para_len-10)
    span_12=T.concatenate([para_reps_tensor4score[:,:,:-11], para_reps_tensor4score[:,:,11:]], axis=1) #(batch, 2*hidden ,para_len-11)
    span_13=T.concatenate([para_reps_tensor4score[:,:,:-12], para_reps_tensor4score[:,:,12:]], axis=1) #(batch, 2*hidden ,para_len-12)

    span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7,
                             span_8, span_9, span_10, span_11, span_12, span_13], axis=2) #(batch, 2*hidden, 13*para_len-78)
    test_span_reps=T.concatenate([span_1, span_2, span_3, span_4, span_5, span_6, span_7], axis=2) #(batch, 2*hidden, 5*para_len-10)  #, span_6, span_7
    #score each span reps
    norm_U_a=normalize_matrix(U_a)
    span_scores_tensor=T.dot(span_reps.dimshuffle(0,2,1), norm_U_a)  #(batch, 13*para_len-78, 1)
    span_scores=T.nnet.softmax(span_scores_tensor.reshape((true_batch_size, 13*paragraph.shape[1]-78))) #(batch, 7*para_len-21)
    loss=-T.mean(T.log(span_scores[T.arange(true_batch_size), gold_indices]))
    
    test_span_scores_tensor=T.dot(test_span_reps.dimshuffle(0,2,1), norm_U_a)  #(batch, 7*para_len-21, 1)
    test_span_scores=T.nnet.softmax(test_span_scores_tensor.reshape((true_batch_size, 7*paragraph.shape[1]-21))) #(batch, 7*para_len-21)    
    test_return=T.argmax(test_span_scores, axis=1) #batch


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]

#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost=loss#+ConvGRU_1.error#


    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
  
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
  
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   #AdaGrad
        updates.append((acc_i, acc))

#     updates=Adam(cost, params, lr=0.0001)

    train_model = theano.function([paragraph, questions,gold_indices, para_mask, q_mask], cost, updates=updates,on_unused_input='ignore')

    test_model = theano.function([paragraph, questions,para_mask, q_mask], test_return, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size
#     remain_train=train_size%batch_size
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)+[train_size-batch_size]


    n_test_batches=test_size/test_batch_size
#     remain_test=test_size%batch_size
    test_batch_start=list(numpy.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]


    max_F1_acc=0.0
    max_exact_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_ids[para_id:para_id+batch_size]
            cost_i+= train_model(
                                 train_para_list[train_id_batch],
                                 train_Q_list[train_id_batch],
                                 train_label_list[train_id_batch],
                                 train_para_mask[train_id_batch],
                                 train_Q_mask[train_id_batch])
            #print iter
            if iter%10==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                print 'Testing...'
                past_time = time.time()
                pred_dict={}
                q_amount=0
                for test_para_id in test_batch_start:
                    batch_predict_ids=test_model(
                                                 test_para_list[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_list[test_para_id:test_para_id+test_batch_size],
                                                 test_para_mask[test_para_id:test_para_id+test_batch_size],
                                                 test_Q_mask[test_para_id:test_para_id+test_batch_size])
                    test_para_wordlist_batch=test_para_wordlist_list[test_para_id:test_para_id+test_batch_size]
                    q_ids_batch=q_idlist[test_para_id:test_para_id+test_batch_size]
                    q_amount+=test_batch_size
 
                    for q in range(test_batch_size): #for each question
                        pred_ans=decode_predict_id(batch_predict_ids[q], test_para_wordlist_batch[q])
                        q_id=q_ids_batch[q]
                        pred_dict[q_id]=pred_ans
                with codecs.open(rootPath+'predictions.txt', 'w', 'utf-8') as outfile:
                    json.dump(pred_dict, outfile)
                F1_acc, exact_acc = standard_eval(rootPath+'dev-v1.1.json', rootPath+'predictions.txt')
                if F1_acc> max_F1_acc:
                    max_F1_acc=F1_acc
                if exact_acc> max_exact_acc:
                    max_exact_acc=exact_acc
#                     if max_exact_acc > max_EM:
#                         store_model_to_file(rootPath+'Best_Paras_google_'+str(max_exact_acc), params)
#                         print 'Finished storing best  params at:', max_exact_acc
                print 'current average F1:', F1_acc, '\t\tmax F1:', max_F1_acc, 'current  exact:', exact_acc, '\t\tmax exact_acc:', max_exact_acc
                





            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #10
0
def evaluate_lenet5(learning_rate=0.5,
                    n_epochs=2000,
                    batch_size=500,
                    emb_size=300,
                    hidden_size=300,
                    L2_weight=0.0001,
                    para_len_limit=700,
                    q_len_limit=40):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = numpy.random.RandomState(23455)
    train_para_list, train_Q_list, train_label_list, train_para_mask, train_mask, word2id, train_feature_matrixlist = load_train(
        para_len_limit, q_len_limit)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_label_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_Q_list, test_para_mask, test_mask, overall_vocab_size, overall_word2id, test_text_list, q_ansSet_list, test_feature_matrixlist = load_dev_or_test(
        word2id, para_len_limit, q_len_limit)
    test_size = len(test_para_list)
    if test_size != len(test_Q_list) or test_size != len(
            test_mask) or test_size != len(test_para_mask):
        print 'test_size!=len(test_Q_list) or test_size!=len(test_mask) or test_size!=len(test_para_mask)'
        exit(0)

    id2word = {y: x for x, y in overall_word2id.iteritems()}
    word2vec = load_word2vec()

    rand_values = random_value_normal((overall_vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    #     rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    # allocate symbolic variables for the data
    #     index = T.lscalar()
    paragraph = T.imatrix('paragraph')
    questions = T.imatrix('questions')
    labels = T.imatrix('labels')
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')
    extraF = T.ftensor3('extraF')  # should be in shape (batch, wordsize, 3)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (paragraph.shape[0], paragraph.shape[1], emb_size)).transpose(
            (0, 2, 1))  # (batch_size, emb_size, maxparalen)
    #
    # #     BdGRU(rng, str(0), shape, X, mask, is_train = 1, batch_size = 1, p = 0.5)
    #
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
    U1_b, W1_b, b1_b = create_GRU_para(rng, emb_size, hidden_size)
    paragraph_para = [U1, W1, b1, U1_b, W1_b, b1_b]
    paragraph_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=paragraph_input,
        Mask=para_mask,
        hidden_dim=hidden_size,
        U=U1,
        W=W1,
        b=b1,
        Ub=U1_b,
        Wb=W1_b,
        bb=b1_b)
    para_reps = paragraph_model.output_tensor  #(batch, emb, para_len)

    Qs_emb = embeddings[questions.flatten()].reshape(
        (questions.shape[0], questions.shape[1], emb_size)).transpose(
            (0, 2, 1))  #(#questions, emb_size, maxsenlength)
    UQ, WQ, bQ = create_GRU_para(rng, emb_size, hidden_size)
    UQ_b, WQ_b, bQ_b = create_GRU_para(rng, emb_size, hidden_size)
    Q_para = [UQ, WQ, bQ, UQ_b, WQ_b, bQ_b]
    questions_model = Bd_GRU_Batch_Tensor_Input_with_Mask(
        X=Qs_emb,
        Mask=q_mask,
        hidden_dim=hidden_size,
        U=UQ,
        W=WQ,
        b=bQ,
        Ub=UQ_b,
        Wb=WQ_b,
        bb=bQ_b)
    questions_reps = questions_model.output_sent_rep_maxpooling.reshape(
        (batch_size, 1, hidden_size))  #(batch, 2*out_size)
    #questions_reps=T.repeat(questions_reps, para_reps.shape[2], axis=1)

    #attention distributions
    W_a1 = create_ensemble_para(
        rng, hidden_size,
        hidden_size)  # init_weights((2*hidden_size, hidden_size))
    W_a2 = create_ensemble_para(rng, hidden_size, hidden_size)
    U_a = create_ensemble_para(rng, 2, hidden_size + 3)  # 3 extra features

    norm_W_a1 = normalize_matrix(W_a1)
    norm_W_a2 = normalize_matrix(W_a2)
    norm_U_a = normalize_matrix(U_a)

    LR_b = theano.shared(
        value=numpy.zeros((2, ),
                          dtype=theano.config.floatX),  # @UndefinedVariable
        name='LR_b',
        borrow=True)

    attention_paras = [W_a1, W_a2, U_a, LR_b]

    transformed_para_reps = T.tanh(
        T.dot(para_reps.transpose((0, 2, 1)), norm_W_a2))
    transformed_q_reps = T.tanh(T.dot(questions_reps, norm_W_a1))
    #transformed_q_reps=T.repeat(transformed_q_reps, transformed_para_reps.shape[1], axis=1)

    add_both = 0.5 * (transformed_para_reps + transformed_q_reps)
    prior_att = T.concatenate([add_both, normalize_matrix(extraF)], axis=2)

    #prior_att=T.concatenate([transformed_para_reps, transformed_q_reps], axis=2)
    valid_indices = para_mask.flatten().nonzero()[0]

    layer3 = LogisticRegression(rng,
                                input=prior_att.reshape(
                                    (batch_size * prior_att.shape[1],
                                     hidden_size + 3)),
                                n_in=hidden_size + 3,
                                n_out=2,
                                W=norm_U_a,
                                b=LR_b)
    #error =layer3.negative_log_likelihood(labels.flatten()[valid_indices])
    error = -T.mean(
        T.log(layer3.p_y_given_x)
        [valid_indices,
         labels.flatten()[valid_indices]])  #[T.arange(y.shape[0]), y])

    distributions = layer3.p_y_given_x[:, -1].reshape(
        (batch_size, para_mask.shape[1]))
    #distributions=layer3.y_pred.reshape((batch_size, para_mask.shape[1]))
    masked_dis = distributions * para_mask
    '''
    strength = T.tanh(T.dot(prior_att, norm_U_a)) #(batch, #word, 1)    
    distributions=debug_print(strength.reshape((batch_size, paragraph.shape[1])), 'distributions')
    
    para_mask=para_mask
    masked_dis=distributions*para_mask
#     masked_label=debug_print(labels*para_mask, 'masked_label')
#     error=((masked_dis-masked_label)**2).mean()


    label_mask=T.gt(labels,0.0)
    neg_label_mask=T.lt(labels,0.0)
    dis_masked=distributions*label_mask
    remain_dis_masked=distributions*neg_label_mask
    
    ans_size=T.sum(label_mask)
    non_ans_size=T.sum(neg_label_mask)
    pos_error=T.sum((dis_masked-label_mask)**2)/ans_size
    neg_error=T.sum((remain_dis_masked-(-neg_label_mask))**2)/non_ans_size
    error=pos_error+0.5*neg_error #(ans_size*1.0/non_ans_size)*
    '''

    #     def AttentionLayer(q_rep, ext_M):
    #         theano_U_a=debug_print(norm_U_a, 'norm_U_a')
    #         prior_att=debug_print(T.nnet.sigmoid(T.dot(q_rep, norm_W_a1).reshape((1, hidden_size)) + T.dot(paragraph_model.output_matrix.transpose(), norm_W_a2)), 'prior_att')
    #        f __name__ == '__main__':
    #         prior_att=T.concatenate([prior_att, ext_M], axis=1)
    #
    #         strength = debug_print(T.tanh(T.dot(prior_att, theano_U_a)), 'strength') #(#word, 1)
    #         return strength.transpose() #(1, #words)

    #     distributions, updates = theano.scan(
    #     AttentionLayer,
    #     sequences=[questions_reps,extraF] )

    #     distributions=debug_print(distributions.reshape((questions.shape[0],paragraph.shape[0])), 'distributions')
    #     labels=debug_print(labels, 'labels')
    #     label_mask=T.gt(labels,0.0)
    #     neg_label_mask=T.lt(labels,0.0)
    #     dis_masked=distributions*label_mask
    #     remain_dis_masked=distributions*neg_label_mask
    #     pos_error=((dis_masked-1)**2).mean()
    #     neg_error=((remain_dis_masked-(-1))**2).mean()
    #     error=pos_error+(T.sum(label_mask)*1.0/T.sum(neg_label_mask))*neg_error

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [embeddings] + paragraph_para + Q_para + attention_paras
    L2_reg = L2norm_paraList(
        [embeddings, U1, W1, U1_b, W1_b, UQ, WQ, UQ_b, WQ_b, W_a1, W_a2, U_a])
    #L2_reg = L2norm_paraList(params)
    cost = error  #+L2_weight*L2_reg

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [paragraph, questions, labels, para_mask, q_mask, extraF],
        error,
        updates=updates,
        on_unused_input='ignore')

    test_model = theano.function(
        [paragraph, questions, para_mask, q_mask, extraF],
        masked_dis,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(numpy.arange(n_train_batches) *
                             batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    #     remain_test=test_size%batch_size
    test_batch_start = list(
        numpy.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    max_exact_acc = 0.0
    cost_i = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #shuffle(train_batch_start)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            #             haha=para_mask[para_id:para_id+batch_size]
            #             print haha
            #             for i in range(batch_size):
            #                 print len(haha[i])
            cost_i += train_model(
                np.asarray(train_para_list[para_id:para_id + batch_size],
                           dtype='int32'),
                np.asarray(train_Q_list[para_id:para_id + batch_size],
                           dtype='int32'),
                np.asarray(train_label_list[para_id:para_id + batch_size],
                           dtype='int32'),
                np.asarray(train_para_mask[para_id:para_id + batch_size],
                           dtype=theano.config.floatX),
                np.asarray(train_mask[para_id:para_id + batch_size],
                           dtype=theano.config.floatX),
                np.asarray(train_feature_matrixlist[para_id:para_id +
                                                    batch_size],
                           dtype=theano.config.floatX))

            #print iter
            if iter % 10 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                print 'Testing...'
                past_time = time.time()

                exact_match = 0.0
                q_amount = 0
                for test_para_id in test_batch_start:
                    distribution_matrix = test_model(
                        np.asarray(test_para_list[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype='int32'),
                        np.asarray(test_Q_list[test_para_id:test_para_id +
                                               batch_size],
                                   dtype='int32'),
                        np.asarray(test_para_mask[test_para_id:test_para_id +
                                                  batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(test_mask[test_para_id:test_para_id +
                                             batch_size],
                                   dtype=theano.config.floatX),
                        np.asarray(
                            test_feature_matrixlist[test_para_id:test_para_id +
                                                    batch_size],
                            dtype=theano.config.floatX))

                    #                     print distribution_matrix
                    test_para_wordlist_list = test_text_list[
                        test_para_id:test_para_id + batch_size]
                    para_gold_ansset_list = q_ansSet_list[
                        test_para_id:test_para_id + batch_size]
                    paralist_extra_features = test_feature_matrixlist[
                        test_para_id:test_para_id + batch_size]
                    sub_para_mask = test_para_mask[test_para_id:test_para_id +
                                                   batch_size]
                    para_len = len(test_para_wordlist_list[0])
                    if para_len != len(distribution_matrix[0]):
                        print 'para_len!=len(distribution_matrix[0]):', para_len, len(
                            distribution_matrix[0])
                        exit(0)
#                     q_size=len(distribution_matrix)
                    q_amount += batch_size
                    #                     print q_size
                    #                     print test_para_word_list
                    for q in range(batch_size):  #for each question
                        #                         if len(distribution_matrix[q])!=len(test_label_matrix[q]):
                        #                             print 'len(distribution_matrix[q])!=len(test_label_matrix[q]):', len(distribution_matrix[q]), len(test_label_matrix[q])
                        #                         else:
                        #                             ss=len(distribution_matrix[q])
                        #                             combine_list=[]
                        #                             for ii in range(ss):
                        #                                 combine_list.append(str(distribution_matrix[q][ii])+'('+str(test_label_matrix[q][ii])+')')
                        #                             print combine_list
                        #                         exit(0)
                        #                         print 'distribution_matrix[q]:',distribution_matrix[q]
                        pred_ans = extract_ansList_attentionList(
                            test_para_wordlist_list[q], distribution_matrix[q],
                            np.asarray(paralist_extra_features[q],
                                       dtype=theano.config.floatX),
                            sub_para_mask[q])
                        q_gold_ans_set = para_gold_ansset_list[q]

                        F1 = MacroF1(pred_ans, q_gold_ans_set)
                        exact_match += F1


#                         match_amount=len(pred_ans_set & q_gold_ans_set)
# #                         print 'q_gold_ans_set:', q_gold_ans_set
# #                         print 'pred_ans_set:', pred_ans_set
#                         if match_amount>0:
#                             exact_match+=match_amount*1.0/len(pred_ans_set)
                exact_acc = exact_match / q_amount
                if exact_acc > max_exact_acc:
                    max_exact_acc = exact_acc
                print 'current average F1:', exact_acc, '\t\tmax F1:', max_exact_acc

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def evaluate_lenet5(learning_rate=0.005, n_epochs=2000, batch_size=300, test_batch_size=400, emb_size=50, hidden_size=300, HL_hidden_size=200,
                    L2_weight=0.0001, train_size=None, test_size=None, batch_size_pred=400, trichar_len=15,char_emb_size=50,
                    para_len=101, question_len=20, c_len=1, model_type='train'):

    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/Users/cisintern/hs/l/workhs/yin/20170320/';
    storePath='/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)
    
    word2id={}
    trichar2id={}
    word2id['UNK']=0 # use it to pad 
    #word2id, trichar2id, questions,questions_mask,paras,paras_mask,labels, isInQ_para, paras_shape, questions_shape, types, types_shape,question_trichar_ids,question_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks
    word2id, trichar2id,train_questions,train_questions_mask,train_paras,train_paras_mask,train_labels, train_islabels, train_paras_shape, train_questions_shape, train_types, train_types_shape,train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks=load_SQUAD_hinrich_v4(train_size, para_len, question_len, trichar_len, word2id,trichar2id, rootPath+'trn20170320.txt')
    word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev.big.20170320.txt')
    word2id, trichar2id,test_questions,test_questions_mask,test_paras,test_paras_mask,test_labels, test_islabels, test_paras_shape, test_questions_shape, test_types, test_types_shape,test_question_trichar_ids,test_question_trichar_masks,test_para_trichar_ids,test_para_trichar_masks,test_type_trichar_ids,test_type_trichar_masks=load_SQUAD_hinrich_v4(test_size, para_len, question_len, trichar_len,word2id, trichar2id, rootPath+'dev20170320.txt')

    print 'word2id size for bigger dataset:', len(word2id), 'trichar size:', len(trichar2id)



    train_size=len(train_questions)
    test_size = len(test_questions) #50010#
    
    train_questions = np.asarray(train_questions, dtype='int32')
    train_questions_shape = np.asarray(train_questions_shape, dtype='int32')
    train_questions_mask = np.asarray(train_questions_mask, dtype=theano.config.floatX)
    train_paras = np.asarray(train_paras, dtype='int32')
    train_paras_shape = np.asarray(train_paras_shape, dtype='int32')
    train_paras_mask = np.asarray(train_paras_mask, dtype=theano.config.floatX)

    train_types = np.asarray(train_types, dtype='int32')
    train_types_shape = np.asarray(train_types_shape, dtype='int32')
    
#     train_c_ids = np.asarray(train_c_ids, dtype='int32')
#     train_c_ids_shape = np.asarray(train_c_ids_shape, dtype='int32')
#     train_c_masks = np.asarray(train_c_masks, dtype=theano.config.floatX)
    
    train_islabels = np.asarray(train_islabels, dtype=theano.config.floatX)

#     train_c_heads = np.asarray(train_c_heads, dtype='int32')
#     train_c_tails = np.asarray(train_c_tails, dtype='int32')
    train_labels = np.asarray(train_labels, dtype='int32')
    #train_question_trichar_ids,train_question_trichar_masks,train_para_trichar_ids,train_para_trichar_masks,train_type_trichar_ids,train_type_trichar_masks
    train_question_trichar_ids = np.asarray(train_question_trichar_ids, dtype='int32')
    train_question_trichar_masks = np.asarray(train_question_trichar_masks, dtype=theano.config.floatX)
    train_para_trichar_ids = np.asarray(train_para_trichar_ids, dtype='int32')
    train_para_trichar_masks = np.asarray(train_para_trichar_masks, dtype=theano.config.floatX)
    train_type_trichar_ids = np.asarray(train_type_trichar_ids, dtype='int32')
    train_type_trichar_masks = np.asarray(train_type_trichar_masks, dtype=theano.config.floatX)
    
    
    
    test_questions = np.asarray(test_questions, dtype='int32')
    test_questions_shape = np.asarray(test_questions_shape, dtype='int32')
    test_questions_mask = np.asarray(test_questions_mask, dtype=theano.config.floatX)
    test_paras = np.asarray(test_paras, dtype='int32')
    test_paras_shape = np.asarray(test_paras_shape, dtype='int32')
    test_paras_mask = np.asarray(test_paras_mask, dtype=theano.config.floatX)

    test_types = np.asarray(test_types, dtype='int32')
    test_types_shape = np.asarray(test_types_shape, dtype='int32')
    
#     test_c_ids = np.asarray(test_c_ids, dtype='int32')
#     test_c_ids_shape = np.asarray(test_c_ids_shape, dtype='int32')
#     test_c_masks = np.asarray(test_c_masks, dtype=theano.config.floatX)
    test_islabels = np.asarray(test_islabels, dtype=theano.config.floatX)
#     test_c_heads = np.asarray(test_c_heads, dtype='int32')
#     test_c_tails = np.asarray(test_c_tails, dtype='int32')
    test_labels = np.asarray(test_labels, dtype='int32')
    test_question_trichar_ids = np.asarray(test_question_trichar_ids, dtype='int32')
    test_question_trichar_masks = np.asarray(test_question_trichar_masks, dtype=theano.config.floatX)
    test_para_trichar_ids = np.asarray(test_para_trichar_ids, dtype='int32')
    test_para_trichar_masks = np.asarray(test_para_trichar_masks, dtype=theano.config.floatX)
    test_type_trichar_ids = np.asarray(test_type_trichar_ids, dtype='int32')
    test_type_trichar_masks = np.asarray(test_type_trichar_masks, dtype=theano.config.floatX)



    overall_vocab_size=len(word2id)
    print 'train size:', train_size, 'test size:', test_size, 'vocab size:', overall_vocab_size


    rand_values=random_value_normal((overall_vocab_size, emb_size), theano.config.floatX, rng)
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=rand_values, borrow=True)
    
    overall_trichar_size = len(trichar2id)
    char_rand_values=random_value_normal((overall_trichar_size, char_emb_size), theano.config.floatX, rng)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)


    para=T.imatrix()  #(2*batch, len)
    para_shape = T.imatrix()
    para_mask=T.fmatrix() #(2*batch, len)
    
    q=T.imatrix()  #(2*batch, len_q)
    q_shape = T.imatrix()
    q_mask=T.fmatrix() #(2*batch, len_q)
    islabels = T.fmatrix()
    labels=T.ivector() #batch

    types=T.imatrix()
    types_shape=T.imatrix()

    q_trichar_ids = T.imatrix()
    q_trichar_masks =T.fmatrix()
    para_trichar_ids = T.imatrix()
    para_trichar_masks =T.fmatrix()
    type_trichar_ids = T.imatrix()
    type_trichar_masks =T.fmatrix()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    true_batch_size = para.shape[0]
    
    paragraph_input = embeddings[para.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len)
    q_input = embeddings[q.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len)
    q_types = embeddings[types.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1))

    paragraph_input_shape = embeddings[para_shape.flatten()].reshape((true_batch_size, para_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, para_len)
    q_input_shape = embeddings[q_shape.flatten()].reshape((true_batch_size, question_len, emb_size)).transpose((0, 2,1)) # (batch, emb_size, question_len)
    q_types_shape = embeddings[types_shape.flatten()].reshape((true_batch_size, 2, emb_size)).transpose((0, 2,1))

    paragraph_input_trichar = char_embeddings[para_trichar_ids.flatten()].reshape((true_batch_size, para_len*trichar_len, char_emb_size)) #(batch, char_emb_size, para_len*trichar_len)
    q_input_trichar = char_embeddings[q_trichar_ids.flatten()].reshape((true_batch_size, question_len*trichar_len, char_emb_size)) # (batch, emb_size, question_len)
    q_types_trichar = char_embeddings[type_trichar_ids.flatten()].reshape((true_batch_size, 2*trichar_len, char_emb_size))
    #sum up trichar emb as word level embs
    paragraph_input_trichar=T.sum((paragraph_input_trichar*para_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, para_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,para_len)
    q_input_trichar=T.sum((q_input_trichar*q_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, question_len, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,q_len)
    q_types_trichar=T.sum((q_types_trichar*type_trichar_masks.dimshuffle(0,1,'x')).reshape((true_batch_size, 2, trichar_len,char_emb_size)),axis=2).dimshuffle(0,2,1) #(true_batch_size, char_emb_size,2)
            
    
    #concatenate word emb with shape emb
    q_input = T.concatenate([q_input,q_input_shape, q_input_trichar],axis=1) #(batch, 2*emb_size+char_emb_size, q_len)
    paragraph_input = T.concatenate([paragraph_input,paragraph_input_shape, paragraph_input_trichar,islabels.dimshuffle(0,'x',1)],axis=1)#(batch, 2*emb_size+char_emb_size+1, para_len)
    q_types_input = T.sum(T.concatenate([q_types,q_types_shape,q_types_trichar],axis=1), axis=2) #(batch, 2*emb+char_emb_size)
    
    fwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size)
    bwd_LSTM_para_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size+1, hidden_size)
    paragraph_para=fwd_LSTM_para_dict.values()+ bwd_LSTM_para_dict.values()# .values returns a list of parameters
    paragraph_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(paragraph_input, para_mask,  hidden_size, fwd_LSTM_para_dict, bwd_LSTM_para_dict)
    paragraph_reps_tensor3=paragraph_model.output_tensor #(batch, 2*hidden, paralen)
    

    fwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size)
    bwd_LSTM_q_dict=create_LSTM_para(rng, 2*emb_size+char_emb_size, hidden_size)
    question_para=fwd_LSTM_q_dict.values()+ bwd_LSTM_q_dict.values()# .values returns a list of parameters
    questions_model=Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(q_input, q_mask,  hidden_size, fwd_LSTM_q_dict, bwd_LSTM_q_dict)
    q_reps=questions_model.output_sent_rep_maxpooling #(batch, 2*hidden)

    #interaction
    batch_ids=T.arange(true_batch_size)
#     c_heads=theano.shared(value=np.asarray([(para_len-1)/2]*batch_size, dtype='int32'), borrow=True)
    c_heads = T.repeat(theano.shared(value=np.asarray([(para_len-1)/2], dtype='int32'), borrow=True), true_batch_size)
    c_tails=c_heads+1
    c_heads_reps=paragraph_reps_tensor3[batch_ids,:,c_heads] #(batch, 2*hidden)
    c_tails_reps=paragraph_reps_tensor3[batch_ids,:,c_tails] #(batch, 2*hidden)
    candididates_reps=T.concatenate([c_heads_reps, c_tails_reps], axis=1) #(batch, 4*hidden)
    context_l=paragraph_model.forward_output[batch_ids,:,c_heads-1] #(batch, hidden)
    context_r=paragraph_model.backward_output[batch_ids,:,c_tails+1]#(batch, hidden)

    
    
    #glove level average
#     c_input = embeddings[c_ids.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
#     c_input_shape = embeddings[c_ids_shape.flatten()].reshape((true_batch_size, c_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
#     c_input = T.concatenate([c_input,c_input_shape],axis=1)
    c_sum = paragraph_input[:,:-1,(para_len-1)/2]#(batch, 2*emb_size+char_emb)
    c_sum_with_isInQLabel = paragraph_input[:,:,(para_len-1)/2]

   

#     e_input = embeddings[e_ids.flatten()].reshape((true_batch_size, e_len, emb_size)).transpose((0, 2,1)) #(batch, emb_size, c_len)
    q_sum = T.sum(q_input*q_mask.dimshuffle(0,'x',1), axis=2) #(batch, 2*emb_size+char_emb_size)
#     average_Q_batch = q_sum/T.sqrt(T.sum(q_sum**2, axis=1)+1e-20).dimshuffle(0,'x')      


    HL_layer_1_input_size=2*hidden_size+4*hidden_size+(2*emb_size+char_emb_size+1)+(2*emb_size+char_emb_size)+1+hidden_size+hidden_size+(2*emb_size+char_emb_size)+1
    cosine_Qtype_cand = cosine_row_wise_twoMatrix(q_types_input, c_sum).dimshuffle(0,'x') #(batch, 1)
    #, average_E_batch, average_C_batch, average_Q_batch
    HL_layer_1_input = T.concatenate([q_reps, candididates_reps, c_sum_with_isInQLabel, q_sum, islabels[:,(para_len-1)/2:(para_len-1)/2+1], context_l, context_r,
                                      q_types_input,
                                      cosine_Qtype_cand], axis=1) 
    
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=HL_hidden_size, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=HL_hidden_size, n_out=HL_hidden_size, activation=T.tanh)
        

    
    
    LR_input= T.concatenate([HL_layer_1.output, HL_layer_2.output, islabels[:,(para_len-1)/2:(para_len-1)/2+1], cosine_Qtype_cand], axis=1) #(batch, char_HL_hidden_size+HL_hidden_size)
    LR_input_size= HL_hidden_size+HL_hidden_size+1+1#HL_layer_1_input_size+2*HL_hidden_size
    U_a = create_ensemble_para(rng, 2, LR_input_size) # the weight matrix hidden_size*2
    norm_U_a=normalize_matrix(U_a)
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='char_LR_b', borrow=True)  #bias for each target class  
    LR_para=[U_a, LR_b]    
    
    
    
    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=norm_U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    
    
    loss=layer_LR.negative_log_likelihood(labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.
    






    params = LR_para+[embeddings,char_embeddings]+paragraph_para+question_para+HL_layer_1.params+HL_layer_2.params
#     load_model_from_file(storePath+'Best_Paras_HS_20170316_0.760357142857', params)
    
#     L2_reg =L2norm_paraList([embeddings,U1, W1, U1_b, W1_b,UQ, WQ , UQ_b, WQ_b, W_a1, W_a2, U_a])
#     L2_reg = L2norm_paraList(params)
    cost=loss#+1e-6*L2_reg


    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
#         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-20)))   #AdaGrad
        updates.append((acc_i, acc))



    train_model = theano.function([para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], cost, updates=updates,on_unused_input='ignore')

#     train_model_pred = theano.function([para, para_mask, c_ids,c_mask,e_ids,e_mask, c_heads, c_tails, l_heads, l_tails, e_heads, e_tails, q, q_mask,labels], layer_LR.y_pred, on_unused_input='ignore')


    test_model = theano.function([para, para_shape, para_mask, q,q_shape,  q_mask,islabels, labels, types, types_shape,q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks], [layer_LR.errors(labels),layer_LR.prop_for_posi], on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless


    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches=train_size/batch_size    #batch_size means how many pairs
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size] 

#     n_train_batches_pred=train_size/batch_size_pred    #batch_size means how many pairs
#     train_batch_start_pred=list(np.arange(n_train_batches_pred)*batch_size_pred)+[train_size-batch_size_pred] 

    n_test_batches=test_size/test_batch_size    #batch_size means how many pairs
    n_test_remain=test_size%test_batch_size    #batch_size means how many pairs
    test_batch_start=list(np.arange(n_test_batches)*test_batch_size)+[test_size-test_batch_size]




    max_acc=0.0
    cost_i=0.0
    train_ids = range(train_size)
#     train_ids_pred = range(train_size)
    best_test_statistic=defaultdict(int)
#     best_train_statistic=defaultdict(int)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        random.shuffle(train_ids)
#         print train_ids[:100]
        iter_accu=0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu +1

            iter_accu+=1
            train_id_list = train_ids[para_id:para_id+batch_size]
#             print 'train_labels[train_id_list]:', train_labels[train_id_list]
            if model_type=='train':
                #para, para_shape, para_mask,q,q_shape, q_mask,islabels, labels, types, types_shape, q_trichar_ids,q_trichar_masks,para_trichar_ids,para_trichar_masks,type_trichar_ids,type_trichar_masks
                cost_i+= train_model(
                                    train_paras[train_id_list],
                                    train_paras_shape[train_id_list],
                                    train_paras_mask[train_id_list],

                                    train_questions[train_id_list],
                                    train_questions_shape[train_id_list],
                                    train_questions_mask[train_id_list],
                                    
                                    train_islabels[train_id_list],
                                    train_labels[train_id_list],
                                    train_types[train_id_list],
                                    train_types_shape[train_id_list],
                                    
                                    train_question_trichar_ids[train_id_list],
                                    train_question_trichar_masks[train_id_list],
                                    train_para_trichar_ids[train_id_list],
                                    train_para_trichar_masks[train_id_list],
                                    train_type_trichar_ids[train_id_list],
                                    train_type_trichar_masks[train_id_list])

            #print iter
            if  iter%10 ==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+'/'+str(len(train_batch_start))+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'

                past_time = time.time()
                print 'Testing...'
                error=0
                test_statistic=defaultdict(int)
                if model_type=='test':
                    writefile=open(storePath+'predictions_20170317.txt', 'w')
                for id, test_para_id in enumerate(test_batch_start):
                    test_id_list = range(test_para_id, test_para_id+test_batch_size)   
#                     print 'test_id_list:',test_id_list    
#                     print 'test_c_heads[test_id_list]', test_c_heads[test_id_list]
#                     gold_labels_list = test_labels_3c[test_para_id:test_para_id+test_batch_size]
                    error_i, preds_i= test_model(
                                        test_paras[test_id_list],
                                        test_paras_shape[test_id_list],
                                        test_paras_mask[test_id_list],

                                        test_questions[test_id_list],
                                        test_questions_shape[test_id_list],
                                        test_questions_mask[test_id_list],
                                        
                                        test_islabels[test_id_list],
                                        test_labels[test_id_list],
                                        test_types[test_id_list],
                                        test_types_shape[test_id_list],
                                        
                                        test_question_trichar_ids[test_id_list],
                                        test_question_trichar_masks[test_id_list],
                                        test_para_trichar_ids[test_id_list],
                                        test_para_trichar_masks[test_id_list],
                                        test_type_trichar_ids[test_id_list],
                                        test_type_trichar_masks[test_id_list])
                    if model_type=='test':
                        if id < len(test_batch_start)-1:
                            writefile.write('\n'.join(map(str,list(preds_i)))+'\n')
                        else:
                            writefile.write('\n'.join(map(str,list(preds_i)[-n_test_remain:]))+'\n')
                    error+=error_i
#                     for ind, gold_label in enumerate(gold_labels_list):
#                         test_statistic[(gold_label, preds_i[ind])]+=1
                if model_type=='test':
                    writefile.close()
                acc=1.0-error*1.0/len(test_batch_start)
#                 acc= (test_statistic.get((1,1),0)+test_statistic.get((0,0),0))*1.0/(test_statistic.get((1,1),0)+test_statistic.get((0,0),0)+test_statistic.get((1,0),0)+test_statistic.get((0,1),0))
                
                if acc> max_acc:
                    max_acc=acc
#                     best_test_statistic=test_statistic
                    if model_type=='train':
                        store_model_to_file(storePath+'Best_Paras_HS_20170324_'+str(max_acc), params)
                        print 'Finished storing best  params at:', max_acc
                print 'current average acc:', acc, '\t\tmax acc:', max_acc#, '\ttest_statistic:', test_statistic
#                 print '\t\t\t\tbest statistic:', best_test_statistic
                if model_type=='test':
                    exit(0)



            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #12
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=2000,
                    batch_size=300,
                    test_batch_size=10000,
                    emb_size=50,
                    hidden_size=50,
                    L2_weight=0.0001,
                    para_len_limit=70,
                    q_len_limit=20,
                    pred_q_len_limit=50,
                    top_n_Qwords=1):

    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/SQuAD/'
    rng = np.random.RandomState(23455)

    word2id = {}
    train_para_list, train_para_mask, train_Q_list, train_Q_mask, train_start_list, train_end_list, _, word2id = load_QGQA(
        word2id, para_len_limit, q_len_limit, top_n_Qwords, True)
    train_size = len(train_para_list)
    if train_size != len(train_Q_list) or train_size != len(
            train_start_list) or train_size != len(train_para_mask):
        print 'train_size!=len(Q_list) or train_size!=len(label_list) or train_size!=len(para_mask)'
        exit(0)

    test_para_list, test_para_mask, test_Q_list, test_Q_mask, test_start_list, test_end_list, _, word2id = load_QGQA(
        word2id, para_len_limit, q_len_limit, top_n_Qwords, False)
    test_size = len(test_para_list)

    train_para_list = np.asarray(train_para_list, dtype='int32')
    train_para_mask = np.asarray(train_para_mask, dtype=theano.config.floatX)

    train_Q_list = np.asarray(train_Q_list, dtype='int32')
    train_Q_mask = np.asarray(train_Q_mask, dtype=theano.config.floatX)

    train_start_list = np.asarray(train_start_list, dtype='int32')
    train_end_list = np.asarray(train_end_list, dtype='int32')

    test_para_list = np.asarray(test_para_list, dtype='int32')
    test_para_mask = np.asarray(test_para_mask, dtype=theano.config.floatX)

    test_Q_list = np.asarray(test_Q_list, dtype='int32')
    test_Q_mask = np.asarray(test_Q_mask, dtype=theano.config.floatX)

    test_start_list = np.asarray(test_start_list, dtype='int32')
    test_end_list = np.asarray(test_end_list, dtype='int32')

    vocab_size = len(word2id) + 1

    #     shared_decoder_mask = [0]*vocab_size
    #     shared_decoder_mask[0]=1#we need this pad token in generated text
    #     for id in train_top_Q_wordids:
    #         shared_decoder_mask[id]=1
    #     shared_decoder_mask=theano.shared(value=np.asarray(shared_decoder_mask, dtype='int32'), borrow=True)  #

    rand_values = random_value_normal((vocab_size, emb_size),
                                      theano.config.floatX,
                                      np.random.RandomState(1234))
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_glove()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(value=rand_values, borrow=True)

    train_top_Q_wordids = set()
    wh_words = [
        'What', 'Which', 'Where', 'When', 'Who', 'Whom', 'Whose', 'Why', 'How',
        'far', 'many', 'much', 'long'
    ]
    for word in wh_words:
        idd = word2id.get(word)
        if idd is not None:
            train_top_Q_wordids.add(idd)
        iddd = word2id.get(word.lower())
        if iddd is not None:
            train_top_Q_wordids.add(iddd)

    paragraph = T.imatrix('paragraph')
    questions_encoderIDs = T.imatrix()  # is ground truth,
    questions_decoderIDS = T.imatrix(
    )  #note we convert then from encoder vocab id to decoder vocab id
    decoder_vocab = T.ivector()
    decoder_mask = T.fmatrix()  #(batch, decoder_vocab_size)
    start_indices = T.ivector()  #batch
    end_indices = T.ivector()  #batch
    para_mask = T.fmatrix('para_mask')
    q_mask = T.fmatrix('q_mask')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    true_batch_size = paragraph.shape[0]
    paragraph_input = embeddings[paragraph.flatten()].reshape(
        (true_batch_size, para_len_limit,
         emb_size)).dimshuffle(0, 2, 1)  #(batch, emb_size, para_len)
    q_input = embeddings[questions_encoderIDs.flatten()].reshape(
        (true_batch_size, q_len_limit, emb_size)).dimshuffle(0, 2, 1)
    decoder_vocab_embs = embeddings[decoder_vocab]

    fwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    bwd_LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
    paragraph_para = fwd_LSTM_para_dict.values() + bwd_LSTM_para_dict.values(
    )  # .values returns a list of parameters
    paragraph_model = Bd_LSTM_Batch_Tensor_Input_with_Mask_Concate(
        paragraph_input, para_mask, hidden_size, fwd_LSTM_para_dict,
        bwd_LSTM_para_dict)
    paragraph_reps_tensor3 = paragraph_model.output_tensor  #(batch, 2*hidden, paralen)

    batch_ids = T.arange(true_batch_size)
    ans_heads = paragraph_reps_tensor3[batch_ids, :, start_indices]
    ans_tails = paragraph_reps_tensor3[batch_ids, :, end_indices]

    l_context_heads = paragraph_reps_tensor3[:, :, 0]
    l_context_tails = paragraph_reps_tensor3[batch_ids, :, start_indices - 1]

    r_context_heads = paragraph_reps_tensor3[batch_ids, :, end_indices + 1]
    r_context_tails = paragraph_reps_tensor3[:, :, -1]

    encoder_reps = T.concatenate([
        l_context_heads, l_context_tails, ans_heads, ans_tails,
        r_context_heads, r_context_tails
    ],
                                 axis=1)  #(batch, 6*2hidden_size)

    decoder_para_dict = create_LSTM_para(rng, emb_size + 12 * hidden_size,
                                         emb_size)

    attention_para_dict1 = create_LSTM_para(rng, 2 * hidden_size, hidden_size)
    attention_para_dict2 = create_LSTM_para(rng, 2 * hidden_size, hidden_size)
    '''
    train
    '''
    groundtruth_as_input = T.concatenate([
        T.alloc(np.asarray(0., dtype=theano.config.floatX), true_batch_size,
                emb_size, 1), q_input[:, :, :-1]
    ],
                                         axis=2)
    #     decoder =  LSTM_Decoder_Train_with_Mask(groundtruth_as_input, encoder_reps, decoder_vocab_embs, q_mask, emb_size, decoder_para_dict)
    #X, Encoder_Tensor_Rep, Encoder_Mask, start_indices, end_indices, vocab_embs, Mask, emb_size, hidden_size, tparams, attention_para_dict1, attention_para_dict2
    decoder = LSTM_Decoder_Train_with_Attention(
        groundtruth_as_input, paragraph_reps_tensor3, para_mask, start_indices,
        end_indices, decoder_vocab_embs, q_mask, emb_size, hidden_size,
        decoder_para_dict, attention_para_dict1, attention_para_dict2)

    prob_matrix = decoder.prob_matrix  #(batch*senlen, decoder_vocab_size)
    probs = prob_matrix[T.arange(true_batch_size * q_len_limit),
                        questions_decoderIDS.flatten()]
    mask_probs = probs[(q_mask.flatten()).nonzero()]
    #we shift question word ids so that in current step, the prob of previsouly predicted id gets lower and lower
    shifted_question_ids = T.concatenate([
        T.alloc(np.asarray(0, dtype='int32'), true_batch_size, 1),
        questions_decoderIDS[:, :-1]
    ],
                                         axis=1)
    probs_to_minimize = prob_matrix[T.arange(true_batch_size * q_len_limit),
                                    shifted_question_ids.flatten()]
    mask_probs_to_minimize = probs_to_minimize[(q_mask.flatten()).nonzero()]

    #loss train

    loss = -T.mean(T.log(mask_probs)) + T.mean(T.exp(mask_probs_to_minimize))
    cost = loss  #+ConvGRU_1.error#
    params = [embeddings] + paragraph_para + decoder_para_dict.values(
    ) + attention_para_dict1.values() + attention_para_dict2.values()

    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True),
                              dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        #         print grad_i.type
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i /
                        (T.sqrt(acc) + 1e-8)))  #AdaGrad
        updates.append((acc_i, acc))

#     #test decoder mask
#     raw_masks = T.zeros((true_batch_size, vocab_size), dtype='int32')
#     x_axis = T.repeat(T.arange(true_batch_size).dimshuffle(0,'x'), paragraph.shape[1], axis=1)
#     input_specific_masks = T.set_subtensor(raw_masks[x_axis.flatten(),paragraph.flatten()],1)
#     overall_test_decoder_mask = T.or_(input_specific_masks, shared_decoder_mask.dimshuffle('x',0))  #(batch, vocab_size)
#     overall_test_decoder_mask=(1.0-overall_test_decoder_mask)*(overall_test_decoder_mask-10)
    '''
    testing
    '''
    #     test_decoder =  LSTM_Decoder_Test_with_Mask(q_len_limit, encoder_reps, decoder_vocab_embs, emb_size, decoder_para_dict)
    #nsteps, Encoder_Tensor_Rep, Encoder_Mask, start_indices, end_indices, vocab_embs, emb_size,hidden_size, tparams,attention_para_dict1, attention_para_dict2
    test_decoder = LSTM_Decoder_Test_with_Attention(
        pred_q_len_limit, paragraph_reps_tensor3, para_mask, start_indices,
        end_indices, decoder_vocab_embs, decoder_mask, emb_size, hidden_size,
        decoder_para_dict, attention_para_dict1, attention_para_dict2)
    predictions = test_decoder.output_id_matrix  #(batch, q_len_limit)

    train_model = theano.function([
        paragraph, questions_encoderIDs, questions_decoderIDS, decoder_vocab,
        start_indices, end_indices, para_mask, q_mask
    ],
                                  cost,
                                  updates=updates,
                                  on_unused_input='ignore')

    test_model = theano.function([
        paragraph, decoder_vocab, decoder_mask, start_indices, end_indices,
        para_mask
    ],
                                 predictions,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    #para_list, Q_list, label_list, mask, vocab_size=load_train()
    n_train_batches = train_size / batch_size
    #     remain_train=train_size%batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / test_batch_size
    remain_test = test_size % test_batch_size
    test_batch_start = list(np.arange(n_test_batches) *
                            test_batch_size) + [test_size - remain_test]

    max_bleuscore = 0.0
    max_exact_acc = 0.0
    cost_i = 0.0
    train_ids = range(train_size)
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        random.shuffle(train_ids)
        iter_accu = 0
        for para_id in train_batch_start:
            # iter means how many batches have been runed, taking into loop
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            sub_Qs = train_Q_list[para_id:para_id + batch_size]
            decoder_vocab_set = train_top_Q_wordids | set(
                list(np.unique(sub_Qs)))
            decoder_vocab_batch = sorted(
                decoder_vocab_set)  # a list of ids in order
            map_encoderid2decoderid = {}
            for encoderID in decoder_vocab_set:
                decoderID = decoder_vocab_batch.index(encoderID)
                map_encoderid2decoderid[encoderID] = decoderID
            Decoder_train_Q_list = []
            for id in sub_Qs.flatten():
                Decoder_train_Q_list.append(map_encoderid2decoderid.get(id))
            Decoder_train_Q_list = np.asarray(Decoder_train_Q_list,
                                              dtype='int32').reshape(
                                                  (batch_size,
                                                   sub_Qs.shape[1]))
            decoder_vocab_batch = np.asarray(decoder_vocab_batch,
                                             dtype='int32')

            cost_i += train_model(
                train_para_list[para_id:para_id + batch_size],
                train_Q_list[para_id:para_id + batch_size],
                Decoder_train_Q_list, decoder_vocab_batch,
                train_start_list[para_id:para_id + batch_size],
                train_end_list[para_id:para_id + batch_size],
                train_para_mask[para_id:para_id + batch_size],
                train_Q_mask[para_id:para_id + batch_size])

            #print iter
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                #                 print 'Testing...'
                past_time = time.time()
                outputfile = codecs.open('output.txt', 'w', 'utf-8')
                referencefile = codecs.open('reference.txt', 'w', 'utf-8')

                bleu_scores = []
                for idd, test_para_id in enumerate(test_batch_start):
                    sub_Qs = test_Q_list[test_para_id:test_para_id +
                                         test_batch_size]
                    decoder_vocab_set = train_top_Q_wordids | set(
                        list(np.unique(sub_Qs)))
                    decoder_vocab_batch = sorted(
                        decoder_vocab_set)  # a list of ids in order

                    map_decoderid2encoderid = {}
                    for encoderID in decoder_vocab_set:
                        decoderID = decoder_vocab_batch.index(encoderID)
                        map_decoderid2encoderid[decoderID] = encoderID

                    if idd == len(test_batch_start) - 1:
                        true_test_batch_size = remain_test
                    else:
                        true_test_batch_size = test_batch_size
                    #decoder mask batch
                    decoder_mask_batch = []
                    for i in range(true_test_batch_size):
                        Q_i = test_Q_list[test_para_id + i]
                        decoder_vocab_Q = train_top_Q_wordids | set(
                            list(np.unique(Q_i)))
                        decoder_mask_ind = []
                        for ele in decoder_vocab_batch:
                            if ele in decoder_vocab_Q:
                                decoder_mask_ind.append(1.0)
                            else:
                                decoder_mask_ind.append(0.0)
                        decoder_mask_batch.append(decoder_mask_ind)
                    decoder_mask_batch = np.asarray(decoder_mask_batch,
                                                    dtype=theano.config.floatX)

                    decoder_vocab_batch = np.asarray(decoder_vocab_batch,
                                                     dtype='int32')
                    pred_id_in_batch = test_model(
                        test_para_list[test_para_id:test_para_id +
                                       test_batch_size], decoder_vocab_batch,
                        decoder_mask_batch,
                        test_start_list[test_para_id:test_para_id +
                                        test_batch_size],
                        test_end_list[test_para_id:test_para_id +
                                      test_batch_size],
                        test_para_mask[test_para_id:test_para_id +
                                       test_batch_size])  #(batch, senlen)
                    ground_truths = sub_Qs
                    ground_mask = test_Q_mask[test_para_id:test_para_id +
                                              test_batch_size]

                    back_pred_id_in_batch = [
                        map_decoderid2encoderid.get(id)
                        for id in pred_id_in_batch.flatten()
                    ]

                    for i in range(true_test_batch_size):
                        #                         print 'pred_id_in_batch[i]:', pred_id_in_batch[i]
                        refined_preds, refined_g = refine_decoder_predictions(
                            back_pred_id_in_batch[i *
                                                  pred_q_len_limit:(i + 1) *
                                                  pred_q_len_limit],
                            ground_truths[i], ground_mask[i])
                        #                         bleu_i = nltk.translate.bleu_score.sentence_bleu([refined_g], refined_preds)
                        #                         bleu_scores.append(bleu_i)
                        pred_q = ''
                        prev_w = ''
                        for id in refined_preds:
                            word = id2word.get(id)
                            if word.isalnum():
                                if word != prev_w:
                                    pred_q += ' ' + word
                                    prev_w = word
                        outputfile.write(pred_q + ' ?\n')
                        referencefile.write(
                            ' '.join([id2word.get(id)
                                      for id in refined_g]) + '\n')

#                 bleuscore =  np.average(np.array(bleu_scores))
                outputfile.close()
                referencefile.close()
                system('perl multi-bleu.perl reference.txt < output.txt')


#                 if max_bleuscore < bleuscore:
#                     max_bleuscore = bleuscore
#                 print '\t\t\t\t\t\t current bleu: ', bleuscore, ' ; max bleu:', max_bleuscore

            if patience <= iter:
                done_looping = True
                break

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))