Example #1
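# --- Assumed imports (not shown in the original snippet) ---
# The example below uses NumPy, Theano, and a few standard-library modules;
# the project-specific helpers it calls (load_wordnet_hyper_vs_all_with_words,
# Conv_for_Pair, create_conv_para, LogisticRegression, apk, root_dic, etc.)
# come from the project's own modules, which are not shown here.
import os
import sys
import time
import random

import numpy as np
import theano
import theano.tensor as T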
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=4,
                    L2_weight=1e-5,
                    extra_size=4,
                    emb_size=300,
                    batch_size=100,
                    filter_size=[3, 3],
                    maxSentLen=40,
                    hidden_size=[300, 300],
                    max_term_len=4,
                    p_mode='conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # fix the random seed so the model produces reproducible results

    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1, all_word2, all_word1_mask, all_word2_mask, all_labels, all_extra, word2id = load_wordnet_hyper_vs_all_with_words(
        maxlen=maxSentLen, wordlen=max_term_len
    )  # each instance carries one label and at least one word per sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1, test_word2, test_word1_mask, test_word2_mask, test_labels, test_extra, word2id = load_task_hyper_vs_all_with_words(
        LenciBenotto_file, maxSentLen, word2id, wordlen=max_term_len)
    store_word2id(word2id, root_dic + 'LenciBenotto_word2id.pkl')
    # exit(0)
    total_size = len(all_sentences_l)
    hold_test_size = 10000
    train_size = total_size - hold_test_size

    train_sents_l = np.asarray(all_sentences_l[:train_size], dtype='int32')
    # dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    # test_sents_l=np.asarray(all_sentences_l[-test_size:], dtype='int32')
    test_sents_l = np.asarray(test_sents_l, dtype='int32')

    train_masks_l = np.asarray(all_masks_l[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    # test_masks_l=np.asarray(all_masks_l[-test_size:], dtype=theano.config.floatX)
    test_masks_l = np.asarray(test_masks_l, dtype=theano.config.floatX)

    train_sents_r = np.asarray(all_sentences_r[:train_size], dtype='int32')
    # dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    # test_sents_r=np.asarray(all_sentences_r[-test_size:], dtype='int32')
    test_sents_r = np.asarray(test_sents_r, dtype='int32')

    train_masks_r = np.asarray(all_masks_r[:train_size],
                               dtype=theano.config.floatX)
    # dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    # test_masks_r=np.asarray(all_masks_r[-test_size:], dtype=theano.config.floatX)
    test_masks_r = np.asarray(test_masks_r, dtype=theano.config.floatX)

    train_word1 = np.asarray(all_word1[:train_size], dtype='int32')
    train_word2 = np.asarray(all_word2[:train_size], dtype='int32')
    test_word1 = np.asarray(test_word1, dtype='int32')
    test_word2 = np.asarray(test_word2, dtype='int32')

    train_word1_mask = np.asarray(all_word1_mask[:train_size],
                                  dtype=theano.config.floatX)
    train_word2_mask = np.asarray(all_word2_mask[:train_size],
                                  dtype=theano.config.floatX)
    test_word1_mask = np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask = np.asarray(test_word2_mask, dtype=theano.config.floatX)

    train_labels_store = np.asarray(all_labels[:train_size], dtype='int32')
    # dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    # test_labels_store=np.asarray(all_labels[-test_size:], dtype='int32')
    test_labels_store = np.asarray(test_labels, dtype='int32')

    train_extra = np.asarray(all_extra[:train_size],
                             dtype=theano.config.floatX)
    test_extra = np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size = len(test_labels_store)
    print 'train size: ', train_size, ' test size: ', test_size

    vocab_size = len(word2id) + 1

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  # initialize the embedding matrix from a Gaussian distribution
    # load pretrained word2vec vectors below to initialize the word embeddings
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)  # row 0 is set to an all-zero vector
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  # wrap the NumPy array "rand_values" in a Theano shared variable
    store_model_to_file(root_dic + 'LenciBenotto_best_para_init_embeddings',
                        [init_embeddings])
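    # Note: load_word2vec / load_word2vec_to_init are project-specific helpers
    # (not shown). They are assumed to read pretrained word2vec vectors and
    # overwrite the rows of rand_values for every word found in word2vec,
    # leaving the Gaussian initialization for out-of-vocabulary words.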

    #now, start to build the input form of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape(
            (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)

    embed_input_l = embed_input(
        init_embeddings, sents_ids_l
    )  #embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r = embed_input(
        init_embeddings, sents_ids_r
    )  #embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape(
        (batch_size, word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape(
        (batch_size, word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1 * word1_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    word2_embedding = T.sum(embed_word2 * word2_mask.dimshuffle(0, 1, 'x'),
                            axis=1)
    '''create_AttentiveConv_params '''
    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[1], 1,
                                                    emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, conv_W_context]
    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=word1_mask,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([
        tt_embeddings_l, tt_embeddings_r, tt_embeddings_l * tt_embeddings_r,
        tt_embeddings_l - tt_embeddings_r
    ],
                         axis=1)

    term_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_word1.dimshuffle(0, 2, 1),
        input_tensor3_r=embed_input_r,
        mask_matrix=word1_mask,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, max_term_len),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([
        td_embeddings_l, td_embeddings_r, td_embeddings_l * td_embeddings_r,
        td_embeddings_l - td_embeddings_r
    ],
                         axis=1)

    def_vs_term_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_word2.dimshuffle(0, 2, 1),
        mask_matrix=sents_mask_l,
        mask_matrix_r=word2_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, max_term_len),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([
        dt_embeddings_l, dt_embeddings_r, dt_embeddings_l * dt_embeddings_r,
        dt_embeddings_l - dt_embeddings_r
    ],
                         axis=1)

    def_vs_def_layer = Conv_for_Pair(
        rng,
        origin_input_tensor3=embed_input_l,
        origin_input_tensor3_r=embed_input_r,
        input_tensor3=embed_input_l,
        input_tensor3_r=embed_input_r,
        mask_matrix=sents_mask_l,
        mask_matrix_r=sents_mask_r,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[1], 1, emb_size, 1),
        W=conv_W,
        b=conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([
        dd_embeddings_l, dd_embeddings_r, dd_embeddings_l * dd_embeddings_r,
        dd_embeddings_l - dd_embeddings_r
    ],
                         axis=1)

    if p_mode == 'conc':
        p = T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4 * 4 * hidden_size[1]
    else:
        p = T.max(T.concatenate([
            p_ww.dimshuffle('x', 0, 1),
            p_wd.dimshuffle('x', 0, 1),
            p_dw.dimshuffle('x', 0, 1),
            p_dd.dimshuffle('x', 0, 1)
        ],
                                axis=0),
                  axis=0)
        p_len = 4 * hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)

    "form input to LR classifier"
    LR_input = T.concatenate([
        p,
        cosine_matrix1_matrix2_rowwise(word1_embedding,
                                       word2_embedding).dimshuffle(0, 'x'),
        extra.dimshuffle(0, 'x')
    ],
                             axis=1)
    LR_input_size = p_len + 1 + 1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(
        rng, 2, LR_input_size)  # weight matrix mapping LR_input_size features to 2 classes
    LR_b = theano.shared(value=np.zeros((2, ), dtype=theano.config.floatX),
                         name='LR_b',
                         borrow=True)  #bias for each target class
    LR_para = [U_a, LR_b]

    layer_LR = LogisticRegression(
        rng,
        input=LR_input,
        n_in=LR_input_size,
        n_out=2,
        W=U_a,
        b=LR_b,
        bias=0.25
    )  # essentially a linear transform of the input features followed by a softmax
    loss = layer_LR.negative_log_likelihood(
        labels
    )  # negative log-likelihood loss for classification; lower is better

    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()

    params = NN_para + LR_para  #[init_embeddings]

    cost = loss  #+L2_weight*L2_reg

    updates = Gradient_Cost_Para(cost, params, learning_rate)
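    # Gradient_Cost_Para is a project-specific helper (not shown) that is
    # assumed to build the gradient-based parameter updates for `params`
    # w.r.t. `cost`; a comparable set of Adagrad-style updates is built
    # inline with T.grad in Example #3 below.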

    train_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    test_model = theano.function([
        sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,
        word2_ids, word1_mask, word2_mask, extra, labels
    ], [layer_LR.errors(labels), layer_LR.y_pred, layer_LR.prop_for_posi],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless (early stopping is effectively disabled)
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    # n_dev_batches=dev_size/batch_size
    # dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    if n_test_remain != 0:
        test_batch_start = list(
            np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    else:
        test_batch_start = list(np.arange(n_test_batches) * batch_size)

    # max_acc_dev=0.0
    max_ap_test = 0.0
    max_ap_topk_test = 0.0
    max_f1 = 0.0

    cost_i = 0.0
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            train_indices
        )  # shuffle the training set at each epoch; usually helps, though not guaranteed

        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many batches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch], train_masks_l[train_id_batch],
                train_sents_r[train_id_batch], train_masks_r[train_id_batch],
                train_word1[train_id_batch], train_word2[train_id_batch],
                train_word1_mask[train_id_batch],
                train_word2_mask[train_id_batch], train_extra[train_id_batch],
                train_labels_store[train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_labels = []
                probs = []
                gold_labels = []
                error_sum = 0.0
                for idd, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    error_i, pred_i, prob_i = test_model(
                        test_sents_l[test_batch_id:test_batch_id + batch_size],
                        test_masks_l[test_batch_id:test_batch_id + batch_size],
                        test_sents_r[test_batch_id:test_batch_id + batch_size],
                        test_masks_r[test_batch_id:test_batch_id + batch_size],
                        test_word1[test_batch_id:test_batch_id + batch_size],
                        test_word2[test_batch_id:test_batch_id + batch_size],
                        test_word1_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_word2_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_extra[test_batch_id:test_batch_id + batch_size],
                        test_labels_store[test_batch_id:test_batch_id +
                                          batch_size])

                    error_sum += error_i
                    pred_labels += list(pred_i)
                    probs += list(prob_i)
                if n_test_remain != 0:
                    # the last test batch overlaps the previous one; keep only
                    # the predictions for the remaining examples
                    probs = probs[:(len(test_batch_start) - 1) *
                                  batch_size] + probs[-n_test_remain:]
                assert len(test_labels) == len(probs)
                # test_acc=1.0-error_sum/(len(test_batch_start))
                test_ap = apk(test_labels, probs, k=len(test_labels))
                test_ap_top100 = apk(test_labels, probs, k=100)

                if test_ap > max_ap_test:
                    max_ap_test = test_ap
                    store_model_to_file(
                        root_dic + 'LenciBenotto_best_para_' +
                        str(max_ap_test), params)
                if test_ap_top100 > max_ap_topk_test:
                    max_ap_topk_test = test_ap_top100
                print '\t\tcurrent ap:', test_ap, ' ; ', '\t\tmax_ap: ', max_ap_test, 'ap@100: ', test_ap_top100, '\tmax_ap@100:', max_ap_topk_test

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
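
# --- Illustrative sketch (not part of the original example) ---
# apk() above is a project helper that is assumed to compute "average
# precision at k" from binary gold labels and predicted probabilities,
# ranking instances by probability. A minimal, hypothetical version of that
# kind of metric could look like this:
def apk_sketch(gold_labels, scores, k):
    # rank instance indices by predicted score, highest first, and keep top k
    ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    hits, precision_sum = 0, 0.0
    for rank, idx in enumerate(ranked, start=1):
        if gold_labels[idx] == 1:
            hits += 1
            precision_sum += hits / float(rank)
    num_pos = sum(1 for y in gold_labels if y == 1)
    return precision_sum / max(1, min(k, num_pos))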
Example #2
def evaluate_lenet5(learning_rate=0.02, n_epochs=4, L2_weight=1e-5, extra_size=4, emb_size=300, batch_size=100, filter_size=[3,3], maxSentLen=40, hidden_size=[300,300], max_term_len=4, p_mode = 'conc'):

    model_options = locals().copy()
    print "model options", model_options

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    # fix the random seed so the model produces reproducible results


    # all_sentences_l, all_masks_l, all_sentences_r, all_masks_r, all_word1,all_word2,all_word1_mask,all_word2_mask,all_labels, all_extra, word2id  =load_wordnet_hyper_vs_all_with_words(maxlen=maxSentLen, wordlen=max_term_len)  #minlen, include one label, at least one word in the sentence
    # test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_labels, word2id  =load_ACE05_dataset(maxSentLen, word2id)
    word2id = load_word2id(root_dic+'LenciBenotto_word2id.pkl')
    test_sents_l, test_masks_l, test_sents_r, test_masks_r, test_word1,test_word2,test_word1_mask,test_word2_mask,test_labels, test_extra, word2id, group_size_list = load_task_hyper_vs_all_with_allDefComb(LenciBenotto_file,maxSentLen, word2id, wordlen=max_term_len)



    test_sents_l=np.asarray(test_sents_l, dtype='int32')

    test_masks_l=np.asarray(test_masks_l, dtype=theano.config.floatX)


    test_sents_r=np.asarray(test_sents_r, dtype='int32')


    test_masks_r=np.asarray(test_masks_r, dtype=theano.config.floatX)


    test_word1=np.asarray(test_word1, dtype='int32')
    test_word2=np.asarray(test_word2, dtype='int32')


    test_word1_mask=np.asarray(test_word1_mask, dtype=theano.config.floatX)
    test_word2_mask=np.asarray(test_word2_mask, dtype=theano.config.floatX)


    test_labels_store=np.asarray(test_labels, dtype='int32')

    test_extra=np.asarray(test_extra, dtype=theano.config.floatX)

    # train_size=len(train_labels_store)
    # dev_size=len(dev_labels_store)
    test_size=len(test_sents_l)
    print ' test size: ', test_size

    vocab_size=len(word2id)+1


    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   # initialize the embedding matrix from a Gaussian distribution
    rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
    id2word = {y:x for x,y in word2id.iteritems()}
    word2vec=load_word2vec()
    rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   # wrap the NumPy array "rand_values" in a Theano shared variable
    # load_model_from_file(root_dic+'Weeds_best_para_init_embeddings', [init_embeddings])

    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    word1_ids = T.imatrix()
    word2_ids = T.imatrix()
    word1_mask = T.fmatrix()
    word2_mask = T.fmatrix()
    extra = T.fvector()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    def embed_input(emb_matrix, sent_ids):
        return emb_matrix[sent_ids.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_input_l=embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_r=embed_input(init_embeddings, sents_ids_r)#embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1)

    embed_word1 = init_embeddings[word1_ids.flatten()].reshape((batch_size,word1_ids.shape[1], emb_size))
    embed_word2 = init_embeddings[word2_ids.flatten()].reshape((batch_size,word2_ids.shape[1], emb_size))
    word1_embedding = T.sum(embed_word1*word1_mask.dimshuffle(0,1,'x'), axis=1)
    word2_embedding = T.sum(embed_word2*word2_mask.dimshuffle(0,1,'x'), axis=1)


    '''create_AttentiveConv_params '''
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[1], 1, emb_size, 1))

    NN_para=[conv_W, conv_b,conv_W_context]

    '''
    attentive convolution function
    '''
    term_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = word1_mask,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    tt_embeddings_l = term_vs_term_layer.attentive_maxpool_vec_l
    tt_embeddings_r = term_vs_term_layer.attentive_maxpool_vec_r

    p_ww = T.concatenate([tt_embeddings_l,tt_embeddings_r,tt_embeddings_l*tt_embeddings_r,tt_embeddings_l-tt_embeddings_r], axis=1)

    term_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_word1.dimshuffle(0,2,1),
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_word1.dimshuffle(0,2,1),
            input_tensor3_r = embed_input_r,
             mask_matrix = word1_mask,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, max_term_len),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    td_embeddings_l = term_vs_def_layer.attentive_maxpool_vec_l
    td_embeddings_r = term_vs_def_layer.attentive_maxpool_vec_r
    p_wd = T.concatenate([td_embeddings_l,td_embeddings_r,td_embeddings_l*td_embeddings_r,td_embeddings_l-td_embeddings_r], axis=1)


    def_vs_term_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_word2.dimshuffle(0,2,1),
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_word2.dimshuffle(0,2,1),
             mask_matrix = sents_mask_l,
             mask_matrix_r = word2_mask,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, max_term_len),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dt_embeddings_l = def_vs_term_layer.attentive_maxpool_vec_l
    dt_embeddings_r = def_vs_term_layer.attentive_maxpool_vec_r

    p_dw = T.concatenate([dt_embeddings_l,dt_embeddings_r,dt_embeddings_l*dt_embeddings_r,dt_embeddings_l-dt_embeddings_r], axis=1)


    def_vs_def_layer = Conv_for_Pair(rng,
            origin_input_tensor3=embed_input_l,
            origin_input_tensor3_r = embed_input_r,
            input_tensor3=embed_input_l,
            input_tensor3_r = embed_input_r,
             mask_matrix = sents_mask_l,
             mask_matrix_r = sents_mask_r,
             image_shape=(batch_size, 1, emb_size, maxSentLen),
             image_shape_r = (batch_size, 1, emb_size, maxSentLen),
             filter_shape=(hidden_size[1], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[1], 1,emb_size, 1),
             W=conv_W, b=conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    dd_embeddings_l = def_vs_def_layer.attentive_maxpool_vec_l
    dd_embeddings_r = def_vs_def_layer.attentive_maxpool_vec_r
    p_dd = T.concatenate([dd_embeddings_l,dd_embeddings_r,dd_embeddings_l*dd_embeddings_r,dd_embeddings_l-dd_embeddings_r], axis=1)

    if p_mode == 'conc':
        p=T.concatenate([p_ww, p_wd, p_dw, p_dd], axis=1)
        p_len = 4*4*hidden_size[1]
    else:
        p = T.max(T.concatenate([p_ww.dimshuffle('x',0,1),p_wd.dimshuffle('x',0,1),p_dw.dimshuffle('x',0,1),p_dd.dimshuffle('x',0,1)],axis=0), axis=0)
        p_len =4*hidden_size[1]

    # HL_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    # HL_input_size=p_len+1+1
    #
    # HL_layer_1=HiddenLayer(rng, input=HL_input, n_in=HL_input_size, n_out=hidden_size[1], activation=T.tanh)


    "form input to LR classifier"
    LR_input = T.concatenate([p,cosine_matrix1_matrix2_rowwise(word1_embedding,word2_embedding).dimshuffle(0,'x'),extra.dimshuffle(0,'x')],axis=1)
    LR_input_size=p_len+1+1
    # LR_input = HL_layer_1.output
    # LR_input_size = hidden_size[1]

    U_a = create_ensemble_para(rng, 2, LR_input_size) # weight matrix mapping LR_input_size features to 2 classes
    LR_b = theano.shared(value=np.zeros((2,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a, LR_b]


    layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=2, W=U_a, b=LR_b, bias=0.25) # essentially a linear transform of the input features followed by a softmax
    loss=layer_LR.negative_log_likelihood(labels)  # negative log-likelihood loss for classification; lower is better


    # L2_reg = (conv_W**2).sum()+(conv_W_context**2).sum()+(U_a**2).sum()





    params = NN_para+LR_para #[init_embeddings]


    # load_model_from_file('/save/wenpeng/datasets/HypeNet/HyperDef_label_meta_best_para_0.938730853392', params)
    load_model_from_file(root_dic+'LenciBenotto_best_para_0.557286573332', params)

    '''
    0.552587544259; current ap: 0.574037513126 ap@100 0.918481316424
    0.557286573332; current ap: 0.576498645289 ap@100 0.909032657538
    '''

    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, word1_ids,word2_ids,word1_mask,word2_mask,extra], [layer_LR.y_pred,layer_LR.prop_for_posi], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless (early stopping is effectively disabled)
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False


    n_test_batches=test_size/batch_size
    n_test_remain = test_size%batch_size
    if n_test_remain!=0:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]
    else:
        test_batch_start=list(np.arange(n_test_batches)*batch_size)



    # max_acc_dev=0.0
    max_ap_test=0.0
    max_ap_topk_test=0.0
    max_f1=0.0


    pred_labels =[]
    probs = []
    gold_labels =[]
    error_sum=0.0
    for idd, test_batch_id in enumerate(test_batch_start): # for each test batch
        pred_i, prob_i=test_model(
                test_sents_l[test_batch_id:test_batch_id+batch_size],
                test_masks_l[test_batch_id:test_batch_id+batch_size],
                test_sents_r[test_batch_id:test_batch_id+batch_size],
                test_masks_r[test_batch_id:test_batch_id+batch_size],
                test_word1[test_batch_id:test_batch_id+batch_size],
                test_word2[test_batch_id:test_batch_id+batch_size],
                test_word1_mask[test_batch_id:test_batch_id+batch_size],
                test_word2_mask[test_batch_id:test_batch_id+batch_size],
                test_extra[test_batch_id:test_batch_id+batch_size])

        # error_sum+=error_i
        pred_labels+=list(pred_i)
        probs+=list(prob_i)

    print len(test_sents_l), len(probs)
    if n_test_remain !=0:
        # the last test batch overlaps the previous one; keep only the predictions for the remaining examples
        probs = probs[:(len(test_batch_start)-1)*batch_size]+probs[-n_test_remain:]
    print len(test_sents_l), len(probs)
    assert len(test_sents_l) == len(probs)
    assert sum(group_size_list) == len(probs)
    # take the maximum probability within each group of definition combinations
    max_probs = []
    prior_size = 0
    for i in range(len(group_size_list)):

        sub_probs = probs[prior_size:prior_size+group_size_list[i]]
        prior_size += group_size_list[i]
        max_probs.append(max(sub_probs))

    print len(group_size_list),len(max_probs),len(test_labels)
    assert len(test_labels) == len(max_probs)
    # test_acc=1.0-error_sum/(len(test_batch_start))
    test_ap = apk(test_labels, max_probs, k=len(test_labels))
    test_ap_top100 = apk(test_labels, max_probs, k=100)


    # if test_ap > max_ap_test:
    #     max_ap_test=test_ap
    #     store_model_to_file('/save/wenpeng/datasets/EVALution/HyperDef_label_4ways_conc_test_on_EVA_allDefComb_best_para_'+str(max_ap_test), params)
    # if test_ap_top100 > max_ap_topk_test:
    #     max_ap_topk_test=test_ap_top100
    print '\t\tcurrent ap:', test_ap,'ap@100', test_ap_top100
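
# --- Illustrative sketch (not part of the original example) ---
# store_model_to_file / load_model_from_file are project-specific helpers
# (not shown). They are assumed to save and restore the values of a list of
# Theano shared variables; a minimal cPickle-based sketch of such helpers:
import cPickle

def store_params_sketch(path, shared_params):
    # dump the current NumPy value of each shared variable, in order
    with open(path, 'wb') as f:
        cPickle.dump([p.get_value(borrow=True) for p in shared_params], f,
                     cPickle.HIGHEST_PROTOCOL)

def load_params_sketch(path, shared_params):
    # load the saved values back into the shared variables, in the same order
    with open(path, 'rb') as f:
        saved_values = cPickle.load(f)
    for p, v in zip(shared_params, saved_values):
        p.set_value(v)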
Example #3
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'):
    hidden_size=emb_size
    model_options = locals().copy()
    print "model options", model_options
    
    rng = np.random.RandomState(1234)    # fix the random seed so the model produces reproducible results


    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,all_labels, word2id  =load_SNLI_dataset(maxlen=maxSentLen)  # each instance carries one label and at least one word per sentence
    train_sents_l=np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l=np.asarray(all_sentences_l[2], dtype='int32')
    
    train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX)
    
    train_sents_r=np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32')
        
    train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX)
            
    train_labels_store=np.asarray(all_labels[0], dtype='int32')
    dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    test_labels_store=np.asarray(all_labels[2], dtype='int32')
        
    train_size=len(train_labels_store)
    dev_size=len(dev_labels_store)
    test_size=len(test_labels_store)
    
    vocab_size=len(word2id)+1
                    
    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   # initialize the embedding matrix from a Gaussian distribution
    # code for initializing the embeddings with word2vec is kept below, commented out
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   # wrap the NumPy array "rand_values" in a Theano shared variable
    
    
    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'    
    
    common_input_l=embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    common_input_r=embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size))
    
    #conv
    if nn=='CNN':
        conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size))
        conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
        NN_para=[conv_W, conv_b]
         
        conv_input_l = common_input_l.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen)
        conv_model_l = Conv_with_input_para(rng, input=conv_input_l,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b)
        conv_output_l=conv_model_l.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1)    
        conv_output_into_tensor3_l=conv_output_l.reshape((batch_size, hidden_size, maxSentLen-filter_size+1))
        mask_for_conv_output_l=T.repeat(sents_mask_l[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_l=(1.0-mask_for_conv_output_l)*(mask_for_conv_output_l-10)
        masked_conv_output_l=conv_output_into_tensor3_l+mask_for_conv_output_l      # mask out padded positions before the max-pooling below
        sent_embeddings_l=T.max(masked_conv_output_l, axis=2) #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
     
        conv_input_r = common_input_r.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen)
        conv_model_r = Conv_with_input_para(rng, input=conv_input_r,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b)
        conv_output_r=conv_model_r.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1)    
        conv_output_into_tensor3_r=conv_output_r.reshape((batch_size, hidden_size, maxSentLen-filter_size+1))
        mask_for_conv_output_r=T.repeat(sents_mask_r[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_r=(1.0-mask_for_conv_output_r)*(mask_for_conv_output_r-10)
        masked_conv_output_r=conv_output_into_tensor3_r+mask_for_conv_output_r      # mask out padded positions before the max-pooling below
        sent_embeddings_r=T.max(masked_conv_output_r, axis=2) #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
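        # Note on the masking trick above: for a 0/1 mask,
        # (1.0 - mask) * (mask - 10) equals 0 where mask == 1 (real tokens)
        # and -10 where mask == 0 (padding). Adding it to the conv features
        # pushes padded positions far below the real ones, so they cannot be
        # selected by the max-pooling over axis=2.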
    
     
    #GRU
    if nn=='GRU':
        U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
        NN_para=[U1, W1, b1]     # U1 and W1 each contain 3 matrices; b1 is the bias
        gru_input_l = common_input_l.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
        gru_layer_l=GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l,  hidden_size, U1, W1, b1)
        sent_embeddings_l=gru_layer_l.output_sent_rep  # (batch_size, hidden_size)
        gru_input_r = common_input_r.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
        gru_layer_r=GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r,  hidden_size, U1, W1, b1)
        sent_embeddings_r=gru_layer_r.output_sent_rep  # (batch_size, hidden_size)


    #LSTM
    if nn=='LSTM':
        LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
        NN_para=LSTM_para_dict.values() # .values returns a list of parameters
        lstm_input_l = common_input_l.dimshuffle((0,2,1)) #LSTM takes the same input format as the GRU
        lstm_layer_l=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l,  hidden_size, LSTM_para_dict)
        sent_embeddings_l=lstm_layer_l.output_sent_rep  # (batch_size, hidden_size)   
        lstm_input_r = common_input_r.dimshuffle((0,2,1)) #LSTM takes the same input format as the GRU
        lstm_layer_r=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r,  hidden_size, LSTM_para_dict)
        sent_embeddings_r=lstm_layer_r.output_sent_rep  # (batch_size, hidden_size)      
    
    
    HL_layer_1_input = T.concatenate([sent_embeddings_l,sent_embeddings_r, sent_embeddings_l*sent_embeddings_r, cosine_matrix1_matrix2_rowwise(sent_embeddings_l,sent_embeddings_r).dimshuffle(0,'x')],axis=1)
    HL_layer_1_input_size = hidden_size*3+1
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh)

    #classification layer: it maps the feature vector to scores for the three target classes
    LR_input_size=HL_layer_1_input_size+2*hidden_size
    U_a = create_ensemble_para(rng, 3, LR_input_size) # weight matrix mapping LR_input_size features to 3 classes
    LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class  
    LR_para=[U_a, LR_b]
    
    LR_input=T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1)
    layer_LR=LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) # essentially a linear transform of the input features followed by a softmax
    loss=layer_LR.negative_log_likelihood(labels)  # negative log-likelihood loss for classification; lower is better
    
    params = [embeddings]+NN_para+LR_para+HL_layer_1.params+HL_layer_2.params   # put all model parameters together
#     L2_reg =L2norm_paraList([embeddings,conv_W, U_a])
#     diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix)

    cost=loss#+Div_reg*diversify_reg#+L2_weight*L2_reg
    
    grads = T.grad(cost, params)    # create a list of gradients for all model parameters
    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)   # Adagrad: accumulate squared gradients
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   # 1e-8 avoids division by zero
        updates.append((acc_i, acc))


    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')    
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    
    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless (early stopping is effectively disabled)
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    n_dev_batches=dev_size/batch_size
    dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_acc_dev=0.0
    max_acc_test=0.0
    
    while epoch < n_epochs:
        epoch = epoch + 1
        train_indices = range(train_size)
        random.Random(200).shuffle(train_indices) # shuffle the training set at each epoch; usually helps, though not guaranteed
        iter_accu=0
        cost_i=0.0
        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]
            cost_i+= train_model(
                                train_sents_l[train_id_batch], 
                                train_masks_l[train_id_batch],
                                train_sents_r[train_id_batch], 
                                train_masks_r[train_id_batch],                                
                                train_labels_store[train_id_batch])

            #after every 500 batches, evaluate on the dev set (and on the test set whenever dev accuracy improves)
            if iter%500==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()
#             if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
#                 print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
#                 past_time = time.time()

                error_sum=0.0
                for dev_batch_id in dev_batch_start: # for each dev batch
                    error_i=dev_model(
                                dev_sents_l[dev_batch_id:dev_batch_id+batch_size], 
                                dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                                dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 
                                dev_masks_r[dev_batch_id:dev_batch_id+batch_size],                                
                                dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                                )
                    
                    error_sum+=error_i
                dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                if dev_accuracy > max_acc_dev:
                    max_acc_dev=dev_accuracy
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev
                    #best dev model, do test
                    error_sum=0.0
                    for test_batch_id in test_batch_start: # for each test batch
                        error_i=test_model(
                                test_sents_l[test_batch_id:test_batch_id+batch_size], 
                                test_masks_l[test_batch_id:test_batch_id+batch_size],
                                test_sents_r[test_batch_id:test_batch_id+batch_size], 
                                test_masks_r[test_batch_id:test_batch_id+batch_size],                                
                                test_labels_store[test_batch_id:test_batch_id+batch_size]
                                )
                        
                        error_sum+=error_i
                    test_accuracy=1.0-error_sum/(len(test_batch_start))
                    if test_accuracy > max_acc_test:
                        max_acc_test=test_accuracy
                    print '\t\tcurrent test acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test
                else:
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax max_acc_dev:', max_acc_dev

        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                    
    return max_acc_test
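
# --- Illustrative sketch (not part of the original examples) ---
# A hypothetical driver for Example #3; the keyword values simply restate
# the defaults, and nn could be set to 'GRU' or 'LSTM' instead of 'CNN'.
if __name__ == '__main__':
    max_test_acc = evaluate_lenet5(learning_rate=0.1, n_epochs=4, emb_size=70,
                                   batch_size=50, filter_size=3,
                                   maxSentLen=50, nn='CNN')
    print 'max test accuracy:', max_test_acc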