Example 1
def evaluate_lenet5(claim):
    learning_rate=0.02
    n_epochs=100
    emb_size=300
    batch_size=1#50
    filter_size=[3]
    sent_len=40
    claim_len=40
    cand_size=10
    hidden_size=[300,300]
    max_pred_pick=5

    model_options = locals().copy()
    print("model options", model_options)

    pred_id2label = {1:'SUPPORTS', 0:'REFUTES', 2:'NOT ENOUGH INFO'}

    seed=1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)    # fix the random seed so the model produces reproducible results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    word2id = {}
    read_word2id = codecs.open('/home/wyin3/workspace/FEVER/src/word2id.txt', 'r', 'utf-8')
    for line in read_word2id:
        parts = line.strip().split()
        word2id[parts[0]] = int(parts[1])
    print('word2id load over, size:', len(word2id))
    read_word2id.close()
    claim_idlist, claim_masklist, sent_ins_ids, sent_ins_mask, sent_cand_list = claim_input_2_theano_input(claim, word2id, claim_len, sent_len, cand_size)

    test_claims=np.asarray([claim_idlist], dtype='int32')
    test_claim_mask=np.asarray([claim_masklist], dtype=theano.config.floatX)

    test_sents=np.asarray([sent_ins_ids], dtype='int32')
    test_sent_masks=np.asarray([sent_ins_mask], dtype=theano.config.floatX)


    vocab_size=len(word2id)+1
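    # +1 presumably reserves one extra id (e.g. 0) for padding / unknown words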




    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #initialize the embedding matrix from a Gaussian distribution
    # id2word = {y:x for x,y in word2id.items()}
    # word2vec=load_word2vec()
    # rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable


    '''
    the first block for evidence identification in two classes (support & reject)
    the second block for textual entailment: given evidence labels, predict the claim labels
    '''
    sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    sents_mask=T.ftensor3()
    # sents_labels=T.imatrix() #(batch, cand_size)
    claim_ids = T.imatrix() #(batch, claim_len)
    claim_mask = T.fmatrix()

    # joint_sents_ids=T.itensor3() #(batch, cand_size, sent_len)
    # joint_sents_mask=T.ftensor3()
    # # joint_sents_labels=T.imatrix() #(batch, cand_size)
    # joint_claim_ids = T.imatrix() #(batch, claim_len)
    # joint_claim_mask = T.fmatrix()
    # joint_labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')


    embed_input_sents=init_embeddings[sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)  #(batch_size*cand_size, emb_size, sent_len); this input format can be fed into a CNN, GRU or LSTM
    embed_input_claim=init_embeddings[claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1)


    "shared parameters"
    conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    "tasl 1 parameters"
    task1_att_conv_W, task1_att_conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    task1_conv_W_context, task1_conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    "task 2 parameters"
    att_conv_W, att_conv_b=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context=create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para=[conv_W, conv_b, task1_att_conv_W, task1_att_conv_b, att_conv_W, att_conv_b,task1_conv_W_context,conv_W_context]
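    # all convolution filters and biases, grouped so they can be added to the model's parameter list below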

    conv_model_sents = Conv_with_Mask(rng, input_tensor3=embed_input_sents,
             mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])),
             image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #multiply the mask with conv_out so that features at padded (UNK) positions are zeroed
    sent_embeddings=conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(rng, input_tensor3=embed_input_claim,
             mask_matrix = claim_mask,
             image_shape=(batch_size, 1, emb_size, claim_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #multiply the mask with conv_out so that features at padded (UNK) positions are zeroed
    claim_embeddings=conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0,'x', 1), cand_size, axis=1)

    '''
    attentive conv for task1
    '''
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(rng,
            input_tensor3=embed_input_sents, #batch_size*cand_size, emb_size, sent_len
            input_tensor3_r = T.repeat(embed_input_claim, cand_size, axis=0),
             mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])),
             mask_matrix_r = T.repeat(claim_mask,cand_size, axis=0),
             image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
             image_shape_r = (batch_size*cand_size, 1, emb_size, claim_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[0], 1,emb_size, 1),
             W=task1_att_conv_W, b=task1_att_conv_b,
             W_context=task1_conv_W_context, b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r




    concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb, T.sum(batch_claim_emb*batch_sent_emb, axis=2).dimshuffle(0,1,'x')], axis=2)
    concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2+1))
    "to score each evidence sentence, we use the output of attentiveConv, as well as the output of standard CNN"
    LR_input = T.concatenate([concate_2_matrix, task1_attentive_sent_embeddings_l,task1_attentive_sent_embeddings_r], axis=1)
    LR_input_size = hidden_size[0]*2+1 + hidden_size[0]*2
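    # with hidden_size[0]=300 this is 300*2+1 + 300*2 = 1201 features per candidate sentence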

    # LR_input = concate_2_matrix
    # LR_input_size = hidden_size[0]*2+1
    #scoring layer: maps each candidate's feature vector of size LR_input_size to a single relevance score
    U_a = create_ensemble_para(rng, 1, LR_input_size) # weight matrix of shape (LR_input_size, 1)
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para=[U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  #(batch_size*cand_size, 1)
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size)
    sents_labels = inter_matrix*binarize_prob
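    # candidates scoring at or below 0.5 are zeroed out; the surviving scores act as soft evidence weights in task 2 below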

    '''
    training task2, predict 3 labels
    '''
    # joint_embed_input_sents=init_embeddings[joint_sents_ids.flatten()].reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    # joint_embed_input_claim=init_embeddings[joint_claim_ids.flatten()].reshape((batch_size,claim_len, emb_size)).dimshuffle(0,2,1)
    # joint_conv_model_sents = Conv_with_Mask(rng, input_tensor3=joint_embed_input_sents,
    #          mask_matrix = joint_sents_mask.reshape((joint_sents_mask.shape[0]*joint_sents_mask.shape[1],joint_sents_mask.shape[2])),
    #          image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # joint_sent_embeddings=joint_conv_model_sents.maxpool_vec #(batch_size*cand_size, hidden_size) # each sentence then have an embedding of length hidden_size
    # joint_batch_sent_emb = joint_sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))
    # "??? use joint_sents_labels means the evidence labels are not provided by task 1?"
    # joint_premise_emb = T.sum(joint_batch_sent_emb*joint_sents_labels.dimshuffle(0,1,'x'), axis=1) #(batch, hidden_size)

    premise_emb = T.sum(batch_sent_emb*sents_labels.dimshuffle(0,1,'x'), axis=1)

    # joint_conv_model_claims = Conv_with_Mask(rng, input_tensor3=joint_embed_input_claim,
    #          mask_matrix = joint_claim_mask,
    #          image_shape=(batch_size, 1, emb_size, claim_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b)    #mutiple mask with the conv_out to set the features by UNK to zero
    # joint_claim_embeddings=joint_conv_model_claims.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size

    premise_hypo_emb = T.concatenate([premise_emb,claim_embeddings], axis=1) #(batch, 2*hidden_size)
    '''
    attentive conv in task2
    '''
    sents_tensor3 = embed_input_sents.dimshuffle(0,2,1).reshape((batch_size, cand_size*sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(0,2,1)) #(batch_size, cand_size*sent_len, cand_size*sent_len)
    sents_dot_2_matrix = T.nnet.softmax(sents_dot.reshape((batch_size*cand_size*sent_len, cand_size*sent_len)))
    sents_context = T.batched_dot(sents_dot_2_matrix.reshape((batch_size, cand_size*sent_len, cand_size*sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents+sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)  #(batch_size*cand_size, emb_size, sent_len)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(rng,
            input_tensor3=add_sents_context, #(batch_size*cand_size, emb_size, sent_len)
            input_tensor3_r = T.repeat(embed_input_claim, cand_size, axis=0),
             mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])),
             mask_matrix_r = T.repeat(claim_mask,cand_size, axis=0),
             image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
             image_shape_r = (batch_size*cand_size, 1, emb_size, claim_len),
             filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
             filter_shape_context=(hidden_size[0], 1,emb_size, 1),
             W=att_conv_W, b=att_conv_b,
             W_context=conv_W_context, b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape((batch_size, cand_size, hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape((batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l*sents_labels.dimshuffle(0,1,'x')
    masked_claim_attconv = attentive_sent_embeddings_r*sents_labels.dimshuffle(0,1,'x')
    fine_max = T.concatenate([T.max(masked_sents_attconv, axis=1),T.max(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    # fine_sum = T.concatenate([T.sum(masked_sents_attconv, axis=1),T.sum(masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    "Logistic Regression layer"
    joint_LR_input = T.concatenate([premise_hypo_emb,fine_max], axis=1)
    joint_LR_input_size=2*hidden_size[0]+2*hidden_size[0]
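    # = 4*300 = 1200 with the default hidden_size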

    joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size) # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    joint_LR_para=[joint_U_a, joint_LR_b]

    joint_layer_LR=LogisticRegression(rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b) #basically it is a multiplication between weight matrix and input feature vector
    # joint_loss=joint_layer_LR.negative_log_likelihood(joint_labels)  #for classification task, we usually used negative log likelihood as loss, the lower the better.







    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    # masked_inter_matrix = inter_matrix * sents_labels #(batch, cand_size)
    # test_premise_emb = T.sum(batch_sent_emb*masked_inter_matrix.dimshuffle(0,1,'x'), axis=1)
    # test_premise_hypo_emb = T.concatenate([test_premise_emb,claim_embeddings], axis=1)
    #
    # #fine-maxsum
    # sents_tensor3 = embed_input_sents.dimshuffle(0,2,1).reshape((batch_size, cand_size*sent_len, emb_size))
    # sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(0,2,1)) #(batch_size, cand_size*sent_len, cand_size*sent_len)
    # sents_dot_2_matrix = T.nnet.softmax(sents_dot.reshape((batch_size*cand_size*sent_len, cand_size*sent_len)))
    # sents_context = T.batched_dot(sents_dot_2_matrix.reshape((batch_size, cand_size*sent_len, cand_size*sent_len)), sents_tensor3) #(batch_size, cand_size*sent_len, emb_size)
    # add_sents_context = embed_input_sents+sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)#T.concatenate([embed_input_sents, sents_context.reshape((batch_size*cand_size, sent_len, emb_size)).dimshuffle(0,2,1)], axis=1) #(batch_size*cand_size, 2*emb_size, sent_len)
    #
    # test_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(rng,
    #         input_tensor3=add_sents_context, #batch_size*cand_size, 2*emb_size, sent_len
    #         input_tensor3_r = T.repeat(embed_input_claim, cand_size, axis=0),
    #          mask_matrix = sents_mask.reshape((sents_mask.shape[0]*sents_mask.shape[1],sents_mask.shape[2])),
    #          mask_matrix_r = T.repeat(claim_mask,cand_size, axis=0),
    #          image_shape=(batch_size*cand_size, 1, emb_size, sent_len),
    #          image_shape_r = (batch_size*cand_size, 1, emb_size, claim_len),
    #          filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
    #          filter_shape_context=(hidden_size[0], 1,emb_size, 1),
    #          W=att_conv_W, b=att_conv_b,
    #          W_context=conv_W_context, b_context=conv_b_context)
    # # attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    # # attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r
    #
    # test_attentive_sent_embeddings_l = test_attentive_conv_layer.attentive_maxpool_vec_l.reshape((batch_size, cand_size, hidden_size[0]))  #(batch_size*cand_size, hidden_size)
    # test_attentive_sent_embeddings_r = test_attentive_conv_layer.attentive_maxpool_vec_r.reshape((batch_size, cand_size, hidden_size[0]))
    # test_masked_sents_attconv = test_attentive_sent_embeddings_l*masked_inter_matrix.dimshuffle(0,1,'x')
    # test_masked_claim_attconv = test_attentive_sent_embeddings_r*masked_inter_matrix.dimshuffle(0,1,'x')
    # test_fine_max = T.concatenate([T.max(test_masked_sents_attconv, axis=1),T.max(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    # # test_fine_sum = T.concatenate([T.sum(test_masked_sents_attconv, axis=1),T.sum(test_masked_claim_attconv, axis=1)],axis=1) #(batch, 2*hidden)
    #
    #
    # test_LR_input = T.concatenate([test_premise_hypo_emb, test_fine_max], axis=1)
    # test_LR_input_size = joint_LR_input_size
    #
    # test_layer_LR=LogisticRegression(rng, input=test_LR_input, n_in=test_LR_input_size, n_out=3, W=joint_U_a, b=joint_LR_b) #basically it is a multiplication between weight matrix and input feature vector



    params = [init_embeddings]+NN_para+LR_para + joint_LR_para
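    # full parameter list: word embeddings, convolution filters, and both classifier layers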
    # print('initialze model parameters...')
    # load_model_from_file('/home1/w/wenpeng/workshop/SciTail/src/model_para_0.8120930232558139', params)

    # train_model = theano.function([sents_ids,sents_mask,sents_labels,claim_ids,claim_mask,joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([sents_ids,sents_mask, claim_ids,claim_mask], [binarize_prob,joint_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')
    # dev_model = theano.function([sents_ids,sents_mask, claim_ids,claim_mask], [binarize_prob,joint_layer_LR.y_pred], allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters

    batch_binary_vec, pred_i=test_model(
            test_sents,
            test_sent_masks,
            test_claims,
            test_claim_mask
            )

    print(batch_binary_vec)
    print(batch_binary_vec[0])
    print(pred_i)
    print(pred_id2label.get(pred_i[0]))
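
A minimal NumPy sketch of the thresholding and label lookup performed in the test step above; the scores and the predicted class id below are made-up illustrative values, not model outputs:

import numpy as np

inter_matrix = np.array([[0.9, 0.2, 0.7, 0.4]])      # (batch=1, cand_size=4) sigmoid scores
binarize_prob = np.where(inter_matrix > 0.5, 1, 0)   # hard 0/1 evidence selection
sents_labels = inter_matrix * binarize_prob          # [[0.9, 0.0, 0.7, 0.0]]

pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}
pred_i = [1]                                         # stands in for joint_layer_LR.y_pred
print(binarize_prob[0], pred_id2label.get(pred_i[0]))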
Example 2
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 10],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    #Original
    # datasets = load_data(dataset)
    # n_out = 10

    # Images for face recognition
    import pickle
    import Utils_dueo
    datasets = Utils_dueo.load_pictures()
    print("Saveing the pickeled data-set")
    pickle.dump(datasets, open("Dataset_unal_48.p",
                               "wb"))  #Attention y is wrong
    print("Saved the pickeled data-set")

    #Loading the pickled images
    #import pickle
    #datasets = pickle.load(open("Dataset.p", "r"))
    n_out = 6
    batch_size = 20
    n_epochs = 2000
    # Images for face recognition

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
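    # Python 2 integer division: any incomplete final batch is dropped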

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (48, 48)  # this is the size of the face images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 48*48)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 48, 48))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (48-5+1,48-5+1)=(44,44)
    # maxpooling reduces this further to (44/2,44/2) = (22,22)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],22,22)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 48, 48),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (22-5+1,22-5+1)=(18,18)
    # maxpooling reduces this further to (18/2,18/2) = (9,9)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],9,9)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 22, 22),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*9*9) = (20, 810)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 9 * 9,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(
                        ('     epoch %i, minibatch %i/%i, test error of best '
                         'model %f %%') % (epoch, minibatch_index + 1,
                                           n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example 3
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10):
        """This class is made to support a variable number of layers. 

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial 
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the DBN

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: intermediate layer sizes, must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network
        """

        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        # The DBN is an MLP, for which all weights of intermediate layers are shared with a
        # different RBM.  We will first construct the DBN as a deep multilayer perceptron, and
        # when constructing each sigmoidal layer we also construct an RBM that shares weights
        # with that layer. During pretraining we will train these RBMs (which will lead
        # to changing the weights of the MLP as well). During finetuning we will finish
        # training the DBN by doing stochastic gradient descent on the MLP.

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of the layer below or
            # the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden layer below or the
            # input of the DBN if you are on the first layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)

            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)

            # it's arguably a philosophical question... but we are going to only declare that
            # the parameters of the sigmoid_layers are parameters of the DBN. The visible
            # biases in the RBM are parameters of those RBMs, but not of the DBN.
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(\
                         input = self.sigmoid_layers[-1].output,\
                         n_in = hidden_layers_sizes[-1], n_out = n_outs)
        self.params.extend(self.logLayer.params)

        # compute the cost for second phase of training, defined as the
        # negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
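
A minimal usage sketch for the constructor above, assuming it belongs to a class named DBN (as the comments suggest) and that numpy, theano and the HiddenLayer / RBM / LogisticRegression classes it references are importable:

import numpy

numpy_rng = numpy.random.RandomState(123)
dbn = DBN(numpy_rng=numpy_rng,            # "DBN" is an assumed class name
          n_ins=28 * 28,                  # e.g. rasterized 28x28 images
          hidden_layers_sizes=[500, 500],
          n_outs=10)
print(dbn.n_layers)                       # 2 sigmoid layers, each paired with an RBM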
Example 4
def evaluate_lenet5(train_set,
                    test_set,
                    valid_set,
                    learning_rate=0.1,
                    n_epochs=200,
                    nkerns=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :param dataset train_set: dataset to use for training.
    :param dataset test_set: dataset to use for testing.
    :param dataset valid_set: dataset to use for validation.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    # create a python generator that returns minibatches one at a time
    def minibatch_generator(dataset):
        dataset_x, dataset_y = dataset
        for i in range(dataset_x.shape[0] // batch_size):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            batch_x = dataset_x[start_idx:end_idx]
            batch_y = dataset_y[start_idx:end_idx]
            yield (batch_x, batch_y)
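    # note: the floor division above silently drops any incomplete final batch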

    x = T.matrix('x')
    y = T.lvector('y')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    # reshape the rasterized images (batch_size, 28*28) into the 4D tensor the conv layer expects
    layer0_input = x.reshape((batch_size, 1, 28, 28))
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function([x, y], layer3.errors(y))

    validate_model = theano.function([x, y], layer3.errors(y))

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function([x, y], cost, updates=updates)

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found

    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995

    n_train_batches = (train_set[0].shape[0] + batch_size - 1) // batch_size

    # go through this many minibatches before checking the network on
    # the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    iter = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1

        minibatch_index = 0
        for minibatch in minibatch_generator(train_set):
            iter += 1
            minibatch_index += 1
            if iter % 100 == 0:
                print('training @ iter = %i' % iter)

            error = train_model(minibatch[0], minibatch[1])

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(vb[0], vb[1])
                    for vb in minibatch_generator(valid_set)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(tb[0], tb[1])
                        for tb in minibatch_generator(test_set)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.),
          file=sys.stderr)
Example 5
def evaluate_mnist_1(learning_rate=0.1,
                     n_epochs=100,
                     nkerns=[4, 6],
                     batch_size=2):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(3)
    xs = []
    ys = []
    # f = open('temp_value', 'r+')
    # f = open('out_10', 'r+')
    f = open('out_10_10', 'r+')

    while (1):
        line = f.readline()
        line2 = f.readline()
        if not line:
            break
        line = line.replace("\n", "")

        values = [float(i) for i in line.split()]
        value = float(line2)

        xs.append(values)
        ys.append(value)

    print(len(xs))
    print(len(xs[0]))
    print(len(ys))
    # print(ys)
    # print(xs)

    test_set_x, test_set_y = shared_dataset([xs, ys])
    valid_set_x, valid_set_y = shared_dataset([xs, ys])
    train_set_x, train_set_y = shared_dataset([xs, ys])
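    # the same (xs, ys) data is reused for training, validation and testing here,
    # so the reported errors only measure how well the model fits this one file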

    # train_set_x, train_set_y = datasets[0]
    # valid_set_x, valid_set_y = datasets[1]
    # test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = len(ys)
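    # a single batch holds the entire data set, so every n_*_batches below becomes 1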
    # batch_size=1
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    # n_train_batches = 1
    # n_valid_batches = 1
    # n_test_batches = 1

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (28, 28)  # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size,28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1,28-5+1)=(24,24)
    # maxpooling reduces this further to (24/2,24/2) = (12,12)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*4*4)
    layer2_input = layer1.output.flatten(2)
    # myprint=theano.function([x],x)
    # myprint([layer2_input])

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=20,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=20, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    prob = layer3.prob_y_given_x(y)

    f1 = open('weights', 'w+')
    print "layer 0 weights"
    for w in layer0.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')

    # print layer0.W.get_value()
    # print layer0.b.get_value()
    print "layer 1 weights"
    # print layer1.W.get_value()
    # print layer1.b.get_value()
    for w in layer1.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')

    print "layer 2 weights"
    # print layer2.W.get_value()
    w = layer2.W.get_value()
    # for d in w:
    #     print d
    for i in range(len(w[0])):
        for j in range(len(w)):
            f1.write(str(w[j][i]) + '\n')
    # print layer2.b.get_value()

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    prob_model = theano.function(
        [index],
        prob,
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    conv_model0 = theano.function(
        [index],
        layer0.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model0_conv = theano.function(
        [index],
        layer0.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    conv_model1 = theano.function(
        [index],
        layer1.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model1_conv = theano.function(
        [index],
        layer1.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model2 = theano.function(
        [index],
        layer2.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    # params = layer0.params + layer1.params + layer2.params + layer3.params

    # x_printed = theano.printing.Print('this is a very important value')(x)
    # f_with_print = theano.function([x], x_printed)
    # f_with_print(layer3.params)

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    val_grads = T.grad(cost, layer3.p_y_given_x)
    # print "AAAA"
    # theano.printing.debugprint(temp_grads)
    # print "AAAA"

    grad_model = theano.function(
        [index],
        grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    val_grad_model = theano.function(
        [index],
        val_grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []

    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    bestConvW = layer0.W.get_value()

    while (epoch < n_epochs) and (not done_looping):

        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index
            val_grads_ij = val_grad_model(minibatch_index)
            grads_ij = grad_model(minibatch_index)
            conv0_ij = conv_model0(minibatch_index)
            conv1_ij = conv_model1(minibatch_index)
            conv2_ij = conv_model2(minibatch_index)
            conv0_conv_ij = conv_model0_conv(minibatch_index)
            conv1_conv_ij = conv_model1_conv(minibatch_index)

            print 'training @ iter = ', iter
            print "last layer var grads"
            print val_grads_ij[0]

            # print "Layer 0 convolution"
            # for c in conv0_conv_ij[0]:
            #     print c
            #     print ""
            # print ""
            # print "Layer 1 convolution"
            # for c in conv1_conv_ij[0]:
            #     print c
            #     print ""
            # print ""
            probs = prob_model(minibatch_index)
            print "Probs"
            print probs
            # print "layer 0 grads"
            # print grads_ij[6]
            # print grads_ij[7]
            # print "layer 1 grads"
            # print grads_ij[4]
            # print grads_ij[5]
            # print "layer 2 grads"
            # print grads_ij[2]
            # print grads_ij[3]
            print "log reg layer grads"
            print grads_ij[0]
            print grads_ij[1]
            print "Layer 0 output"
            # for c in conv0_ij:
            #     for d in c:
            #         print d
            # print conv0_ij[0][0]
            print "Layer 1 output"
            # print conv1_ij[0][0]
            # for c in conv1_ij:
            #     for d in c:
            #         print d
            print "Layer 2 output"
            # for c in conv2_ij:
            #     print c
            cost_ij = train_model(minibatch_index)

            # for c in conv0_conv_ij[1]:
            #     print c
            #     print ""

            print "learning_rate"
            print learning_rate
            print "layer 0 weights"
            # print layer0.W.get_value()
            # print layer0.b.get_value()
            print "layer 1 weights"
            # print layer1.W.get_value()
            # print layer1.b.get_value()
            print "layer 2 weights"
            w = layer2.W.get_value()
            # print w[0]
            # print w[1]

            # for c in layer2.W.get_value():
            #     print c
            # print layer2.b.get_value()
            print "log reg layer weights"
            print layer3.W.get_value()
            print layer3.b.get_value()
            print "COST"
            print cost_ij

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestConvW = layer0.W.get_value()
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print(
                        ('     epoch %i, minibatch %i/%i, test error of best '
                         'model %f %%') % (epoch, minibatch_index + 1,
                                           n_train_batches, test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example 6
def evaluate_lenet5(learning_rate=0.1,
                    n_epochs=200,
                    dataset=DataSet,
                    nkerns=[20, 50],
                    batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print type(train_set_x)

    #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540])
    #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540])
    #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540])

    #train_set_x = train_set_x / 100
    #valid_set_x = valid_set_x / 100
    #test_set_x = test_set_x / 100

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    #n_test_batches /= batch_size
    n_test_batches = (n_test_batches /
                      batch_size) + (n_test_batches % batch_size > 0)
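    # rounds up, so a final partial test batch is still counted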

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ishape = (27, 10)  # spatial size of each of the two input channels (the data is not MNIST)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape the first 540 columns of each example (2*27*10 values)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    xinp = x[:, :540]
    layer0_input = xinp.reshape((batch_size, 2, 27, 10))
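    # the remaining columns, x[:, 540:], bypass the convolutions and are concatenated
    # onto the hidden-layer input below (layer2_inputT); cf. the +12 in layer2's n_in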

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (27-5+1,10-2+1)=(23,9)
    # maxpooling reduces this further to (23/2,9/2) = (11,4)
    # 4D output tensor is thus of shape (batch_size,nkerns[0],11,4)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 2, 27, 10),
                                filter_shape=(nkerns[0], 2, 5, 2),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (11-5+1,4-2+1)=(7,3)
    # maxpooling reduces this further to (7/2,3/2) = (3,1)
    # 4D output tensor is thus of shape (batch_size,nkerns[1],3,1)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 11, 4),
                                filter_shape=(nkerns[1], nkerns[0], 5, 2),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size,num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1]*3*1)
    layer2_input = layer1.output.flatten(2)
    layer2_inputT = T.concatenate([layer2_input, x[:, 540:]], axis=1)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_inputT,
                         n_in=(nkerns[1] * 3 * 1) + 12,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    #yPred = layer3.ypred(layer2.output)
    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size][:],
            y: train_set_y[index * batch_size:(index + 1) * batch_size][:]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' % \
                      (epoch, minibatch_index + 1, n_train_batches, \
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #tm =  test_model(0)

                    yP = numpy.asarray([])
                    test_losses = [
                        test_model(i)[0] for i in xrange(n_test_batches)
                    ]
                    for i in xrange(n_test_batches):
                        yP = numpy.concatenate((yP, test_model(i)[1]))
                    print yP.shape
                    test_score = numpy.mean(test_losses)

                    #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value())
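                    # from here on, the Python name 'y' is rebound from the symbolic label
                    # vector to the raw numpy array of test labels (the compiled Theano
                    # functions above already captured the symbolic variable)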
                    y = test_set_y.owner.inputs[0].get_value()

                    I1 = numpy.nonzero(y == 0.0)
                    I2 = numpy.nonzero(y == 1.0)
                    I3 = numpy.nonzero(y == 2.0)
                    I11 = numpy.nonzero(yP[I1[0]] == 0)
                    I12 = numpy.nonzero(yP[I1[0]] == 1)
                    I13 = numpy.nonzero(yP[I1[0]] == 2)
                    I21 = numpy.nonzero(yP[I2[0]] == 0)
                    I22 = numpy.nonzero(yP[I2[0]] == 1)
                    I23 = numpy.nonzero(yP[I2[0]] == 2)
                    I31 = numpy.nonzero(yP[I3[0]] == 0)
                    I32 = numpy.nonzero(yP[I3[0]] == 1)
                    I33 = numpy.nonzero(yP[I3[0]] == 2)

                    acc1 = float(float(I11[0].size) / float(I1[0].size))
                    acc2 = float(float(I22[0].size) / float(I2[0].size))
                    if n_out == 3:
                        acc3 = float(float(I33[0].size) / float(I3[0].size))
                    else:
                        acc3 = 0
                    print((
                        '     epoch %i, minibatch %i/%i, test error of '
                        'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, I11 = %i, I12 = %i, I13 = %i, I21 = %i, I22 = %i, I23 = %i, I31 = %i, I32 = %i, I33 = %i %%'
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100., acc1 * 100., acc2 * 100.,
                         acc3 * 100, I11[0].size, I12[0].size, I13[0].size,
                         I21[0].size, I22[0].size, I23[0].size, I31[0].size,
                         I32[0].size, I33[0].size))

                    #print(('     epoch %i, minibatch %i/%i, test error of best '
                    #       'model %f %%') %
                    #      (epoch, minibatch_index + 1, n_train_batches,
                    #       test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
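The per-class counters I11..I33 above amount to building a 3x3 confusion matrix by hand; a minimal NumPy sketch of the same bookkeeping (hypothetical helper name, labels assumed to be 0..2):

import numpy

def confusion_counts(y_true, y_pred, n_classes=3):
    # counts[i, j] = number of examples whose true label is i and predicted label is j
    counts = numpy.zeros((n_classes, n_classes), dtype=int)
    for t, p in zip(y_true.astype(int), y_pred.astype(int)):
        counts[t, p] += 1
    return counts

# per-class accuracy = diagonal / row sums (guarding against empty classes);
# e.g. acc1 above corresponds to counts[0, 0] / float(counts[0, :].sum())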
Ejemplo n.º 7
0
def evaluate_lenet5(learning_rateOld=0.2,
                    n_epochs=1200,
                    nkerns=[48, 128, 192, 192],
                    batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer (so by default the first layer would have 20 kernels and the second 50?)
    """
    global G_params
    rng = numpy.random.RandomState(23455)
    learning_rate_decay = 0.998
    initial_learning_rate = 1.0
    #### the params for momentum
    mom_start = 0.5
    mom_end = 0.99
    # for epoch in [0, mom_epoch_interval], the momentum increases linearly
    # from mom_start to mom_end. After mom_epoch_interval, it stays at mom_end
    mom_epoch_interval = batch_size * 5
    squared_filter_length_limit = 15.0
    mom_params = {
        "start": mom_start,
        "end": mom_end,
        "interval": mom_epoch_interval
    }

    valid_set_x, valid_set_y = loadValid()
    test_set_x, test_set_y = loadTest()
    #just init train_set_x and y
    data_x, data_y = loadTrainDataWithIndex(1)

    # compute number of minibatches for training, validation and testing
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    #hard-coded for now
    n_train_batches = 100
    # by default, the first split is used as the validation batches
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    print(n_train_batches)
    print(n_valid_batches)
    '''shared_x = theano.shared(numpy.asarray(data_x,
                                               dtype=theano.config.floatX),
                                 borrow=True)
    shared_y = theano.shared(numpy.asarray(data_y,
                                               dtype=theano.config.floatX),
                                 borrow=True)'''

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    epoch = T.scalar()
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels
    learning_rate = theano.shared(
        numpy.asarray(initial_learning_rate, dtype=theano.config.floatX))
    ishape = (50, 50)  # input images are 50x50 (not MNIST-sized)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 50*50)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 50, 50))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (50-5+1, 50-5+1) = (46, 46)
    # maxpooling reduces this further to (46/2, 46/2) = (23, 23)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 23, 23)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, 50, 50),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (23-3+1, 23-3+1) = (21, 21)
    # maxpooling reduces this further to (21/2, 21/2) = (10, 10)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 10, 10)
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 23, 23),
                                filter_shape=(nkerns[1], nkerns[0], 3, 3),
                                poolsize=(2, 2))
    '''layer1_3 = LeNetConvPoolLayer(rng, input=layer1.output,
            image_shape=(batch_size, nkerns[1], 10, 10),
            filter_shape=(nkerns[2], nkerns[1], 3, 3), poolsize=(2, 2))'''

    layer1_3 = LeNetConvPoolLayerNoPooling(rng,
                                           input=layer1.output,
                                           image_shape=(batch_size, nkerns[1],
                                                        10, 10),
                                           filter_shape=(nkerns[2], nkerns[1],
                                                         3, 3))

    layer1_4 = LeNetConvPoolLayer(rng,
                                  input=layer1_3.output,
                                  image_shape=(batch_size, nkerns[2], 8, 8),
                                  filter_shape=(nkerns[3], nkerns[2], 3, 3),
                                  poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[3] * 3 * 3)
    layer2_input = layer1_4.output.flatten(2)

    # construct a dropout fully-connected ReLU layer
    dropoutlayer2 = DropoutHiddenLayer(rng,
                                       input=layer2_input,
                                       n_in=nkerns[3] * 3 * 3,
                                       n_out=1920,
                                       activation=ReLU)

    # construct a second dropout fully-connected ReLU layer
    dropoutlayer2_2 = DropoutHiddenLayer(rng,
                                         input=dropoutlayer2.output,
                                         n_in=1920,
                                         n_out=1920,
                                         activation=ReLU)

    # classify the values of the fully-connected sigmoidal layer
    dropoutlayer3 = LogisticRegression(input=dropoutlayer2_2.output,
                                       n_in=1920,
                                       n_out=58)

    # the cost we minimize during training is the NLL of the model
    dropoutcost = dropoutlayer3.negative_log_likelihood(y)

    # construct the matching non-dropout ReLU layer, reusing the dropout layer's weights scaled by 0.5
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[3] * 3 * 3,
                         n_out=1920,
                         activation=ReLU,
                         W=dropoutlayer2.W * 0.5,
                         b=dropoutlayer2.b)

    # construct the matching non-dropout ReLU layer for the second hidden layer (weights scaled by 0.5)
    layer2_2 = HiddenLayer(rng,
                           input=layer2.output,
                           n_in=1920,
                           n_out=1920,
                           activation=ReLU,
                           W=dropoutlayer2_2.W * 0.5,
                           b=dropoutlayer2_2.b)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2_2.output, n_in=1920, n_out=58)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + dropoutlayer2_2.params + dropoutlayer2.params + layer1_4.params + layer1_3.params + layer1.params + layer0.params

    # Compute gradients of the model wrt parameters
    gparams = []
    for param in params:
        # Use the right cost function here to train with or without dropout.
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # ... and allocate memory for momentum'd versions of the gradient
    gparams_mom = []
    for param in params:
        gparam_mom = theano.shared(
            numpy.zeros(param.get_value(borrow=True).shape,
                        dtype=theano.config.floatX))
        gparams_mom.append(gparam_mom)

    # Compute momentum for the current epoch
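    # the momentum ramps linearly from mom_start to mom_end as
    # mom = mom_start * (1 - epoch/interval) + mom_end * (epoch/interval),
    # and is then held at mom_end once epoch >= mom_epoch_interval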
    mom = ifelse(
        epoch < mom_epoch_interval,
        mom_start * (1.0 - epoch / mom_epoch_interval) + mom_end *
        (epoch / mom_epoch_interval), mom_end)

    # Update the step direction using momentum
    updates = OrderedDict()
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        # Misha Denil's original version
        #updates[gparam_mom] = mom * gparam_mom + (1. - mom) * gparam

        # change the update rule to match Hinton's dropout paper
        updates[gparam_mom] = mom * gparam_mom - (1. -
                                                  mom) * learning_rate * gparam

    # ... and take a step along that direction
    for param, gparam_mom in zip(params, gparams_mom):
        # Misha Denil's original version
        #stepped_param = param - learning_rate * updates[gparam_mom]

        # since we have included learning_rate in gparam_mom, we don't need it
        # here
        stepped_param = param + updates[gparam_mom]

        # This is a silly hack to constrain the norms of the rows of the weight
        # matrices.  This just checks if there are two dimensions to the
        # parameter and constrains it if so... maybe this is a bit silly but it
        # should work for now.
        if param.get_value(borrow=True).ndim == 2:
            #squared_norms = T.sum(stepped_param**2, axis=1).reshape((stepped_param.shape[0],1))
            #scale = T.clip(T.sqrt(squared_filter_length_limit / squared_norms), 0., 1.)
            #updates[param] = stepped_param * scale

            # constrain the norms of the COLUMNs of the weight, according to
            # https://github.com/BVLC/caffe/issues/109
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0,
                                   T.sqrt(squared_filter_length_limit))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    G_params = params
    # create a list of gradients for all model parameters
    #grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i],grads[i]) pairs.
    #updates = []
    #for param_i, grad_i in zip(params, grads):
    #    updates.append((param_i, param_i - learning_rate * grad_i))
    '''train_model = theano.function([epoch], dropoutcost, updates=updates,
          givens={
            x: shared_x,
            y: T.cast(shared_y, 'int32')})'''
    '''train_model = theano.function([epoch, index], dropoutcost, updates=updates,
          givens={
            x: shared_x[index * batch_size: (index + 1) * batch_size],
            y: T.cast(shared_y, 'int32')[index * batch_size: (index + 1) * batch_size]})'''

    decay_learning_rate = theano.function(
        inputs=[],
        outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})
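    # note: Theano computes a function's outputs before applying its updates, so the
    # value returned by decay_learning_rate() is the learning rate prior to this decay step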
    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 1000000  # look as this many examples regardless
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch_counter = 0
    done_looping = False

    while (epoch_counter < n_epochs) and (not done_looping):
        epoch_counter = epoch_counter + 1
        for j in range(2):
            data_x, data_y = loadTrainDataWithIndex(j + 1)
            shared_x = theano.shared(numpy.asarray(data_x,
                                                   dtype=theano.config.floatX),
                                     borrow=True)
            shared_y = theano.shared(numpy.asarray(data_y,
                                                   dtype=theano.config.floatX),
                                     borrow=True)
            train_model = theano.function(
                [epoch, index],
                dropoutcost,
                updates=updates,
                givens={
                    x:
                    shared_x[index * batch_size:(index + 1) * batch_size],
                    y:
                    T.cast(shared_y, 'int32')[index * batch_size:(index + 1) *
                                              batch_size]
                })
            for minibatch_index in xrange(n_train_batches):
                cost_ij = train_model(epoch_counter, minibatch_index)

        # compute zero-one loss on validation set
        validation_losses = [
            validate_model(i) for i in xrange(n_valid_batches)
        ]
        this_validation_loss = numpy.mean(validation_losses)
        print('epoch %i,  validation error %f %%' % \
                (epoch_counter, \
                this_validation_loss * 100.))
        test_losses = [test_model(i) for i in xrange(n_test_batches)]
        test_score = numpy.mean(test_losses)
        print(('     epoch %i, test error of best '
               'model %f %%') % (epoch_counter, test_score * 100.))
        saveParams(epoch_counter, params)

        # if we got the best validation score until now
        if this_validation_loss < best_validation_loss:
            best_validation_loss = this_validation_loss
            best_iter = epoch_counter  # track the epoch that achieved the best validation loss
            best_params = params
        new_learning_rate = decay_learning_rate()
        print("New learning rate:" + str(new_learning_rate))

    end_time = time.clock()
    print('Optimization complete.')
    saveParams(1000, params)
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
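As a reference for the update rule used above (a velocity that already absorbs the learning rate, plus a per-column max-norm constraint on 2D weight matrices), here is a minimal NumPy sketch with hypothetical names; it illustrates the scheme and is not a drop-in replacement for the Theano updates:

import numpy

def momentum_maxnorm_step(param, grad, velocity, lr, mom, norm_limit=15.0):
    # velocity update matching the dropout-paper rule used above
    velocity = mom * velocity - (1.0 - mom) * lr * grad
    stepped = param + velocity
    if stepped.ndim == 2:
        # rescale any column whose L2 norm exceeds sqrt(norm_limit)
        col_norms = numpy.sqrt((stepped ** 2).sum(axis=0))
        desired = numpy.clip(col_norms, 0.0, numpy.sqrt(norm_limit))
        stepped = stepped * (desired / (1e-7 + col_norms))
    return stepped, velocity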
def evaluate_lenet5(sigma=0.01,
                    learning_rate=0.1,
                    n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type sigma: float
    :param sigma: standard deviation in normal distribution

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState(930508)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
    # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.

    # Parameterizing
    n_feature = train_set_x.get_value().shape[1]
    matrix_dim = numpy.sqrt(n_feature)
    matrix_dim = matrix_dim.astype('int8')

    layer0_input = x.reshape((batch_size, 1, matrix_dim, matrix_dim))
    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                image_shape=(batch_size, 1, matrix_dim,
                                             matrix_dim),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (nkerns[0], nkerns[1], 4, 4)
    temp1 = (matrix_dim - 5 + 1) / 2
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                image_shape=(batch_size, nkerns[0], temp1,
                                             temp1),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    temp2 = (temp1 - 5 + 1) / 2
    ### This is a good place to add noise ###
    srng = RandomStreams(seed=508)
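    # note: the hard-coded 50 in the noise shape below assumes nkerns[1] == 50 (the
    # default), so that the noise vector broadcasts across the flattened layer1 output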
    variation = srng.normal((temp2 * temp2 * 50, ), 0, sigma)
    layer2_input += variation
    ### end ###

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * temp2 * temp2,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    n_out = max(train_set_y.eval()) - min(train_set_y.eval()) + 1
    # print n_out
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            # if iter % 10 == 0:
            # print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                '''print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))'''

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    '''print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))'''
                    print test_score * 100.

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    '''print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score *
          100.))'''
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
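The distinctive step in the example above is injecting Gaussian noise into the flattened convolutional features before the hidden layer; a minimal, self-contained sketch of that idea (assumed imports and names, not tied to the exact shapes used above):

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

srng = RandomStreams(seed=508)
features = T.matrix('features')                       # (batch_size, n_features)
noisy = features + srng.normal(features.shape, avg=0.0, std=0.01)
add_noise = theano.function([features], noisy)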
Ejemplo n.º 9
0
    def __init__(self, numpy_rng=None, theano_rng=None, n_ins=784,
                 hidden_layers_sizes=[500, 500], n_outs=10,
                 corruption_levels=[0.1, 0.1]):
        #save for raw_dump
        self.n_ins = n_ins
        self.hidden_layers_sizes = hidden_layers_sizes
        self.n_outs = n_outs
        self.corruption_levels = corruption_levels

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not numpy_rng:
            numpy_rng = numpy.random.RandomState(numpy.random.randint(2 ** 30))

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels

        for i in xrange(self.n_layers):

            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output


            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)
            self.params.extend(sigmoid_layer.params)


            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,  #shared weight
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)

        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
                         input=self.sigmoid_layers[-1].output,
                         n_in=hidden_layers_sizes[-1], 
                         n_out=n_outs)

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetuning

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
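A brief usage sketch for the constructor above (the enclosing class name SdA and the data dimensions here are assumptions): each dA shares its weights with the corresponding sigmoid layer, so layer-wise pretraining of the autoencoders directly initializes the supervised MLP.

import numpy

numpy_rng = numpy.random.RandomState(89677)
sda = SdA(numpy_rng=numpy_rng,
          n_ins=28 * 28,
          hidden_layers_sizes=[500, 500],
          n_outs=10,
          corruption_levels=[0.1, 0.1])
# sda.dA_layers[i].W is sda.sigmoid_layers[i].W, so reconstruction training
# updates the same weights that the finetuning stage (self.finetune_cost) uses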
def evaluate_lenet5(learning_rate=0.0001,
                    n_epochs=2000,
                    nkerns=[256, 256],
                    batch_size=1,
                    window_width=[4, 4],
                    maxSentLength=64,
                    emb_size=300,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0006,
                    Div_reg=0.06,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40):
    maxSentLength = max_truncate + 2 * (window_width[0] - 1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size = load_wikiQA_corpus(
        rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt',
        rootPath + 'test_filtered.txt', max_truncate,
        maxSentLength)  #vocab_size covers train, dev and test
    #datasets, vocab_size=load_wikiQA_corpus(rootPath+'vocab_lower_in_word2vec.txt', rootPath+'WikiQA-train.txt', rootPath+'test_filtered.txt', maxSentLength)#vocab_size contain train, dev and test
    mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test = load_mts_wikiQA(
        mtPath + 'result_train/concate_2mt_train.txt',
        mtPath + 'result_test/concate_2mt_test.txt')
    wm_train, wm_test = load_wmf_wikiQA(
        rootPath + 'train_word_matching_scores.txt',
        rootPath + 'test_word_matching_scores.txt')
    #wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_word_matching_scores_normalized.txt', rootPath+'test_word_matching_scores_normalized.txt')
    indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad = datasets[
        0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]

    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]
    indices_test, testY, testLengths, normalized_test_length, testLeftPad, testRightPad = datasets[
        1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]

    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    n_train_batches = indices_train_l.shape[0] / batch_size
    n_test_batches = indices_test_l.shape[0] / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_l = theano.shared(numpy.asarray(indices_train_l,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_train_r = theano.shared(numpy.asarray(indices_train_r,
                                                  dtype=theano.config.floatX),
                                    borrow=True)
    indices_test_l = theano.shared(numpy.asarray(indices_test_l,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_test_r = theano.shared(numpy.asarray(indices_test_r,
                                                 dtype=theano.config.floatX),
                                   borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    #cost_tmp=0
    error_sum = 0

    # allocate symbolic variables for the data
    index = T.lscalar()
    x_index_l = T.lmatrix(
        'x_index_l')  # now, x is the index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    wmf = T.dmatrix()
    cost_tmp = T.dscalar()
    #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten()
    ishape = (emb_size, maxSentLength)  # each sentence is treated as an emb_size x maxSentLength 'image'
    filter_size = (emb_size, window_width[0])
    filter_size_2 = (nkerns[0], window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1)
    length_after_wideConv = ishape[1] + filter_size[1] - 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Build the network inputs from the embeddings: each sentence becomes an
    # (emb_size, maxSentLength) matrix of word vectors
    #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1]))
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()

    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]), 'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]), 'r_input_tensor')

    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))  #25.2%

    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]

    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U,
                                       W=W,
                                       b=b,
                                       bptt_truncate=-1)

    cosine_sent = cosine(layer0_A1.output_sent_rep, layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))  #25.2%

    #ibm attentive pooling at extended sentence level
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix, layer0_A1.dim,
        layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) / 2)
    #     attention_vec_l_extended=T.nnet.softmax(T.max(attention_matrix, axis=1)).transpose()
    #     ibm_l_extended=layer0_A1.output_matrix.dot(attention_vec_l_extended).transpose()
    #     attention_vec_r_extended=T.nnet.softmax(T.max(attention_matrix, axis=0)).transpose()
    #     ibm_r_extended=layer0_A2.output_matrix.dot(attention_vec_r_extended).transpose()
    #     cosine_ibm_extended=cosine(ibm_l_extended, ibm_r_extended)
    #     eucli_ibm_extended=1.0/(1.0+EUCLID(ibm_l_extended, ibm_r_extended))#25.2%

    #ibm attentive pooling at original sentence level
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates, layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(T.max(simi_matrix_sent,
                                           axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))  #25.2%
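    # attention_vec_l / attention_vec_r are softmax weights over the hidden states of the
    # left / right sentence (row-wise / column-wise maxima of the similarity matrix), and
    # ibm_l / ibm_r are the resulting attention-weighted sentence representations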

    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]  #only average the max 3 vectors
    ll = T.sort(kNeighborsArg).flatten()  # put the selected indices in ascending order

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[
        -3:]  #only average the max 3 vectors
    rr = T.sort(kNeighborsArg_r).flatten()  # put the selected indices in ascending order

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')

    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]

    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1,
                                 W=W1,
                                 b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(layer1_A1.output_vector_last.reshape((1, nkerns[1])),
                        'vec_l')
    vec_r = debug_print(layer1_A2.output_vector_last.reshape((1, nkerns[1])),
                        'vec_r')

    #     sum_uni_l=T.sum(layer0_l_input, axis=3).reshape((1, emb_size))
    #     aver_uni_l=sum_uni_l/layer0_l_input.shape[3]
    #     norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum())
    #     sum_uni_r=T.sum(layer0_r_input, axis=3).reshape((1, emb_size))
    #     aver_uni_r=sum_uni_r/layer0_r_input.shape[3]
    #     norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum())
    #
    uni_cosine = cosine(vec_l, vec_r)
    #     aver_uni_cosine=cosine(aver_uni_l, aver_uni_r)
    #     uni_sigmoid_simi=debug_print(T.nnet.sigmoid(T.dot(norm_uni_l, norm_uni_r.T)).reshape((1,1)),'uni_sigmoid_simi')
    #     '''
    #     linear=Linear(sum_uni_l, sum_uni_r)
    #     poly=Poly(sum_uni_l, sum_uni_r)
    #     sigmoid=Sigmoid(sum_uni_l, sum_uni_r)
    #     rbf=RBF(sum_uni_l, sum_uni_r)
    #     gesd=GESD(sum_uni_l, sum_uni_r)
    #     '''
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))  #25.2%
    #     #eucli_1_exp=1.0/T.exp(EUCLID(sum_uni_l, sum_uni_r))
    #
    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))
    #
    #     '''
    #     len_l=length_l.reshape((1,1))
    #     len_r=length_r.reshape((1,1))
    #     '''
    #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1))
    #length_gap=T.sqrt((len_l-len_r)**2)
    #layer3_input=mts
    layer3_input = T.concatenate(
        [
            vec_l,
            vec_r,
            uni_cosine,
            eucli_1,
            cosine_addition,
            eucli_addition,
            #                                 cosine_sent, eucli_sent,
            ibm_l.reshape((1, nkerns[0])),
            ibm_r.reshape((1, nkerns[0])),  #2*nkerns[0]+
            cosine_ibm,
            eucli_ibm,
            len_l,
            len_r,
            wmf
        ],
        axis=1)  #, layer2.output, layer1.output_cosine], axis=1)
    #layer3_input=T.concatenate([mts,eucli, uni_cosine, len_l, len_r, norm_uni_l-(norm_uni_l+norm_uni_r)/2], axis=1)
    #layer3=LogisticRegression(rng, input=layer3_input, n_in=11, n_out=2)
    layer3 = LogisticRegression(rng,
                                input=layer3_input,
                                n_in=(2 * nkerns[1] + 2) + 2 +
                                (2 * nkerns[0] + 2) + 2 + 2,
                                n_out=2)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() + (U1**2).sum() +
        (W1**2).sum(), 'L2_reg'
    )  #+(conv_W** 2).sum()+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg = Diversify_Reg(layer3.W.T) + Diversify_Reg(
        U[0]) + Diversify_Reg(W[0]) + Diversify_Reg(U1[0]) + Diversify_Reg(
            W1[0]) + Diversify_Reg(U[1]) + Diversify_Reg(W[1]) + Diversify_Reg(
                U1[1]) + Diversify_Reg(W1[1]) + Diversify_Reg(
                    U[2]) + Diversify_Reg(W[2]) + Diversify_Reg(
                        U1[2]) + Diversify_Reg(W1[2])
    cost_this = debug_print(layer3.negative_log_likelihood(y),
                            'cost_this')  #+L2_weight*L2_reg
    cost = debug_print((cost_this + cost_tmp) / update_freq +
                       L2_weight * L2_reg + Div_reg * diversify_reg, 'cost')
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [layer3.prop_for_posi, layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            wmf: wm_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = layer3.params + layer1_para + layer0_para  #+[embeddings]# + layer1.params

    #     params_conv = [conv_W, conv_b]

    #     accumulator=[]
    #     for para_i in params:
    #         eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
    #         accumulator.append(theano.shared(eps_p, borrow=True))
    #
    #     # create a list of gradients for all model parameters
    #     grads = T.grad(cost, params)
    #
    #     updates = []
    #     for param_i, grad_i, acc_i in zip(params, grads, accumulator):
    #         grad_i=debug_print(grad_i,'grad_i')
    #         acc = acc_i + T.sqr(grad_i)
    #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc)))   #AdaGrad
    #         updates.append((acc_i, acc))

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
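        # note: here b1 and b2 play the role of (1 - beta1) and (1 - beta2) in the usual
        # Adam notation, so the defaults correspond to beta1 = 0.9 and beta2 = 0.999,
        # and lr_t applies the standard bias correction sqrt(1 - beta2^t) / (1 - beta1^t)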
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates

    updates = Adam(cost=cost, params=params, lr=learning_rate)

    train_model = theano.function(
        [index, cost_tmp],
        cost,
        updates=updates,
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    train_model_predict = theano.function(
        [index], [cost_this, layer3.errors(y), layer3_input, y],
        givens={
            x_index_l: indices_train_l[index:index + batch_size],
            x_index_r: indices_train_r[index:index + batch_size],
            y: trainY[index:index + batch_size],
            left_l: trainLeftPad_l[index],
            right_l: trainRightPad_l[index],
            left_r: trainLeftPad_r[index],
            right_r: trainRightPad_r[index],
            length_l: trainLengths_l[index],
            length_r: trainLengths_r[index],
            norm_length_l: normalized_train_length_l[index],
            norm_length_r: normalized_train_length_r[index],
            mts: mt_train[index:index + batch_size],
            wmf: wm_train[index:index + batch_size]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatches before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.time()

    mid_time = start_time

    epoch = 0
    done_looping = False

    svm_max = 0.0
    best_epoch = 0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0
        #shuffle(train_batch_start)#shuffle training data
        cost_tmp = 0.0
        for batch_start in train_batch_start:
            # iter counts how many minibatches have been processed so far across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #if epoch %2 ==0:
            #    batch_start=batch_start+remain_train
            #time.sleep(0.5)
            #             print batch_start
            if iter % update_freq != 0:
                cost_ij, error_ij, layer3_input, y = train_model_predict(
                    batch_start)
                #print 'layer3_input', layer3_input
                cost_tmp += cost_ij
                error_sum += error_ij
                #print 'cost_acc ',cost_acc
                #print 'cost_ij ', cost_ij
                #print 'cost_tmp before update',cost_tmp
            else:
                cost_average = train_model(batch_start, cost_tmp)
                #print 'layer3_input', layer3_input
                error_sum = 0
                cost_tmp = 0.0  #reset for the next batch
                #print 'cost_average ', cost_average
                #print 'cost_this ',cost_this
                #exit(0)
            #exit(0)
            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' average cost: ' + str(
                        cost_average) + ' error: ' + str(
                            error_sum) + '/' + str(
                                update_freq) + ' error rate: ' + str(
                                    error_sum * 1.0 / update_freq)
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_probs = []
                test_y = []
                test_features = []
                for i in test_batch_start:
                    prob_i, layer3_input, y = test_model(i)
                    #test_losses = [test_model(i) for i in test_batch_start]
                    test_probs.append(prob_i[0][0])
                    test_y.append(y[0])
                    test_features.append(layer3_input[0])

                MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt',
                                           test_probs)
                #now, check MAP and MRR
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                     'model %f, MRR  %f') %
                    (epoch, minibatch_index, n_train_batches, MAP, MRR))
                #now, see the results of LR
                #write_feature=open(rootPath+'feature_check.txt', 'w')
                train_y = []
                train_features = []
                count = 0
                for batch_start in train_batch_start:
                    cost_ij, error_ij, layer3_input, y = train_model_predict(
                        batch_start)
                    train_y.append(y[0])
                    train_features.append(layer3_input[0])
                    #write_feature.write(str(batch_start)+' '+' '.join(map(str,layer3_input[0]))+'\n')
                    #count+=1

                #write_feature.close()

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm = clf.decision_function(test_features)
                MAP_svm, MRR_svm = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_svm)

                lr = LinearRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                MAP_lr, MRR_lr = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_lr)
                print '\t\t\t\t\t\t\tSVM, MAP: ', MAP_svm, ' MRR: ', MRR_svm, ' LR, MAP: ', MAP_lr, ' MRR: ', MRR_lr

            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
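The training loop above periodically scores the test set and hands the raw probabilities to compute_map_mrr, which lives elsewhere in the project. As a rough, self-contained illustration of what such a helper computes, the sketch below derives MAP and MRR from scores that are already grouped per query together with binary relevance labels; this grouped format is an assumption made for the sketch, not the actual layout of test_filtered.txt.

def map_mrr(grouped):
    """grouped: list of (scores, labels) pairs, one per query; labels are 0/1 relevance."""
    aps, rrs = [], []
    for scores, labels in grouped:
        # rank candidates by descending score
        order = sorted(range(len(scores)), key=lambda i: -scores[i])
        hits, precisions, first_rank = 0, [], None
        for rank, i in enumerate(order, start=1):
            if labels[i] == 1:
                hits += 1
                precisions.append(hits / float(rank))
                if first_rank is None:
                    first_rank = rank
        if hits:
            aps.append(sum(precisions) / hits)   # average precision for this query
            rrs.append(1.0 / first_rank)         # reciprocal rank of the first hit
    return sum(aps) / len(aps), sum(rrs) / len(rrs)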
Ejemplo n.º 11
0
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70, batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'):
    hidden_size=emb_size
    model_options = locals().copy()
    print "model options", model_options
    
    rng = np.random.RandomState(1234)    #random seed, control the model generates the same results 


    all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,all_labels, word2id  =load_SNLI_dataset(maxlen=maxSentLen)  # each instance has one label and at least one word per sentence
    train_sents_l=np.asarray(all_sentences_l[0], dtype='int32')
    dev_sents_l=np.asarray(all_sentences_l[1], dtype='int32')
    test_sents_l=np.asarray(all_sentences_l[2], dtype='int32')
    
    train_masks_l=np.asarray(all_masks_l[0], dtype=theano.config.floatX)
    dev_masks_l=np.asarray(all_masks_l[1], dtype=theano.config.floatX)
    test_masks_l=np.asarray(all_masks_l[2], dtype=theano.config.floatX)
    
    train_sents_r=np.asarray(all_sentences_r[0], dtype='int32')
    dev_sents_r=np.asarray(all_sentences_r[1]    , dtype='int32')
    test_sents_r=np.asarray(all_sentences_r[2] , dtype='int32')
        
    train_masks_r=np.asarray(all_masks_r[0], dtype=theano.config.floatX)
    dev_masks_r=np.asarray(all_masks_r[1], dtype=theano.config.floatX)
    test_masks_r=np.asarray(all_masks_r[2], dtype=theano.config.floatX)
            
    train_labels_store=np.asarray(all_labels[0], dtype='int32')
    dev_labels_store=np.asarray(all_labels[1], dtype='int32')
    test_labels_store=np.asarray(all_labels[2], dtype='int32')
        
    train_size=len(train_labels_store)
    dev_size=len(dev_labels_store)
    test_size=len(test_labels_store)
    
    vocab_size=len(word2id)+1
                    
    rand_values=rng.normal(0.0, 0.01, (vocab_size, emb_size))   #generate a matrix by Gaussian distribution
    #here, we leave code for loading word2vec to initialize words
#     rand_values[0]=np.array(np.zeros(emb_size),dtype=theano.config.floatX)
#     id2word = {y:x for x,y in word2id.iteritems()}
#     word2vec=load_word2vec()
#     rand_values=load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings=theano.shared(value=np.array(rand_values,dtype=theano.config.floatX), borrow=True)   #wrap up the python variable "rand_values" into theano variable      
    
    
    #now, start to build the input form of the model
    sents_ids_l=T.imatrix()
    sents_mask_l=T.fmatrix()
    sents_ids_r=T.imatrix()
    sents_mask_r=T.fmatrix()
    labels=T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'    
    
    common_input_l=embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)) #the input format can be adapted into CNN or GRU or LSTM
    common_input_r=embeddings[sents_ids_r.flatten()].reshape((batch_size,maxSentLen, emb_size))
    
    #conv
    if nn=='CNN':
        conv_W, conv_b=create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size))
        conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3]))
        NN_para=[conv_W, conv_b]
         
        conv_input_l = common_input_l.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen)
        conv_model_l = Conv_with_input_para(rng, input=conv_input_l,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b)
        conv_output_l=conv_model_l.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1)    
        conv_output_into_tensor3_l=conv_output_l.reshape((batch_size, hidden_size, maxSentLen-filter_size+1))
        mask_for_conv_output_l=T.repeat(sents_mask_l[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_l=(1.0-mask_for_conv_output_l)*(mask_for_conv_output_l-10) # 0 at real positions, -10 at padded positions
        masked_conv_output_l=conv_output_into_tensor3_l+mask_for_conv_output_l      # add the mask so that max-pooling ignores padded positions
        sent_embeddings_l=T.max(masked_conv_output_l, axis=2) #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
     
        conv_input_r = common_input_r.dimshuffle((0,'x', 2,1)) #(batch_size, 1, emb_size, maxsenlen)
        conv_model_r = Conv_with_input_para(rng, input=conv_input_r,
                 image_shape=(batch_size, 1, emb_size, maxSentLen),
                 filter_shape=(hidden_size, 1, emb_size, filter_size), W=conv_W, b=conv_b)
        conv_output_r=conv_model_r.narrow_conv_out #(batch, 1, hidden_size, maxsenlen-filter_size+1)    
        conv_output_into_tensor3_r=conv_output_r.reshape((batch_size, hidden_size, maxSentLen-filter_size+1))
        mask_for_conv_output_r=T.repeat(sents_mask_r[:,filter_size-1:].reshape((batch_size, 1, maxSentLen-filter_size+1)), hidden_size, axis=1) #(batch_size, hidden_size, maxSentLen-filter_size+1)
        mask_for_conv_output_r=(1.0-mask_for_conv_output_r)*(mask_for_conv_output_r-10) # 0 at real positions, -10 at padded positions
        masked_conv_output_r=conv_output_into_tensor3_r+mask_for_conv_output_r      # add the mask so that max-pooling ignores padded positions
        sent_embeddings_r=T.max(masked_conv_output_r, axis=2) #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    
     
    #GRU
    if nn=='GRU':
        U1, W1, b1=create_GRU_para(rng, emb_size, hidden_size)
        NN_para=[U1, W1, b1]     # U1 and W1 each contain 3 matrices; b1 is the bias
        gru_input_l = common_input_l.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
        gru_layer_l=GRU_Batch_Tensor_Input_with_Mask(gru_input_l, sents_mask_l,  hidden_size, U1, W1, b1)
        sent_embeddings_l=gru_layer_l.output_sent_rep  # (batch_size, hidden_size)
        gru_input_r = common_input_r.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
        gru_layer_r=GRU_Batch_Tensor_Input_with_Mask(gru_input_r, sents_mask_r,  hidden_size, U1, W1, b1)
        sent_embeddings_r=gru_layer_r.output_sent_rep  # (batch_size, hidden_size)


    #LSTM
    if nn=='LSTM':
        LSTM_para_dict=create_LSTM_para(rng, emb_size, hidden_size)
        NN_para=LSTM_para_dict.values() # .values returns a list of parameters
        lstm_input_l = common_input_l.dimshuffle((0,2,1)) # LSTM uses the same input format as GRU
        lstm_layer_l=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_l, sents_mask_l,  hidden_size, LSTM_para_dict)
        sent_embeddings_l=lstm_layer_l.output_sent_rep  # (batch_size, hidden_size)   
        lstm_input_r = common_input_r.dimshuffle((0,2,1)) # LSTM uses the same input format as GRU
        lstm_layer_r=LSTM_Batch_Tensor_Input_with_Mask(lstm_input_r, sents_mask_r,  hidden_size, LSTM_para_dict)
        sent_embeddings_r=lstm_layer_r.output_sent_rep  # (batch_size, hidden_size)      
    
    
    HL_layer_1_input = T.concatenate([sent_embeddings_l,sent_embeddings_r, sent_embeddings_l*sent_embeddings_r, cosine_matrix1_matrix2_rowwise(sent_embeddings_l,sent_embeddings_r).dimshuffle(0,'x')],axis=1)
    HL_layer_1_input_size = hidden_size*3+1
    HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size, activation=T.tanh)
    HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size, n_out=hidden_size, activation=T.tanh)

    #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative
    LR_input_size=HL_layer_1_input_size+2*hidden_size
    U_a = create_ensemble_para(rng, 3, LR_input_size) # weight matrix mapping LR_input_size features to 3 classes
    LR_b = theano.shared(value=np.zeros((3,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class  
    LR_para=[U_a, LR_b]
    
    LR_input=T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output],axis=1)
    layer_LR=LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    loss=layer_LR.negative_log_likelihood(labels)  # for classification we use the negative log likelihood as the loss; the lower the better.
    
    params = [embeddings]+NN_para+LR_para+HL_layer_1.params+HL_layer_2.params   # put all model parameters together
#     L2_reg =L2norm_paraList([embeddings,conv_W, U_a])
#     diversify_reg= Diversify_Reg(U_a.T)+Diversify_Reg(conv_W_into_matrix)

    cost=loss#+Div_reg*diversify_reg#+L2_weight*L2_reg
    
    grads = T.grad(cost, params)    # create a list of gradients for all model parameters
    accumulator=[]
    for para_i in params:
        eps_p=np.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc)+1e-8)))   # 1e-8 is added to avoid division by zero
        updates.append((acc_i, acc))    


    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')    
    test_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    
    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time= mid_time
    epoch = 0
    done_looping = False

    n_train_batches=train_size/batch_size
    train_batch_start=list(np.arange(n_train_batches)*batch_size)+[train_size-batch_size]
    n_dev_batches=dev_size/batch_size
    dev_batch_start=list(np.arange(n_dev_batches)*batch_size)+[dev_size-batch_size]
    n_test_batches=test_size/batch_size
    test_batch_start=list(np.arange(n_test_batches)*batch_size)+[test_size-batch_size]

        
    max_acc_dev=0.0
    max_acc_test=0.0
    
    while epoch < n_epochs:
        epoch = epoch + 1
        train_indices = range(train_size)
        random.Random(200).shuffle(train_indices) # shuffle the training set at each epoch; this usually helps but is not guaranteed to
        iter_accu=0
        cost_i=0.0
        for batch_id in train_batch_start: #for each batch
            # iter counts how many batches have been processed so far across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu +1
            iter_accu+=1
            train_id_batch = train_indices[batch_id:batch_id+batch_size]
            cost_i+= train_model(
                                train_sents_l[train_id_batch], 
                                train_masks_l[train_id_batch],
                                train_sents_r[train_id_batch], 
                                train_masks_r[train_id_batch],                                
                                train_labels_store[train_id_batch])

            # every 500 batches, report training cost and evaluate on dev (and on test if dev improves)
            if iter%500==0:
                print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
                past_time = time.time()
#             if epoch >=3 and iter >= len(train_batch_start)*2.0/3 and iter%500==0:
#                 print 'Epoch ', epoch, 'iter '+str(iter)+' average cost: '+str(cost_i/iter), 'uses ', (time.time()-past_time)/60.0, 'min'
#                 past_time = time.time()

                error_sum=0.0
                for dev_batch_id in dev_batch_start: # for each test batch
                    error_i=dev_model(
                                dev_sents_l[dev_batch_id:dev_batch_id+batch_size], 
                                dev_masks_l[dev_batch_id:dev_batch_id+batch_size],
                                dev_sents_r[dev_batch_id:dev_batch_id+batch_size], 
                                dev_masks_r[dev_batch_id:dev_batch_id+batch_size],                                
                                dev_labels_store[dev_batch_id:dev_batch_id+batch_size]
                                )
                    
                    error_sum+=error_i
                dev_accuracy=1.0-error_sum/(len(dev_batch_start))
                if dev_accuracy > max_acc_dev:
                    max_acc_dev=dev_accuracy
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax_acc_dev:', max_acc_dev
                    #best dev model, do test
                    error_sum=0.0
                    for test_batch_id in test_batch_start: # for each test batch
                        error_i=test_model(
                                test_sents_l[test_batch_id:test_batch_id+batch_size], 
                                test_masks_l[test_batch_id:test_batch_id+batch_size],
                                test_sents_r[test_batch_id:test_batch_id+batch_size], 
                                test_masks_r[test_batch_id:test_batch_id+batch_size],                                
                                test_labels_store[test_batch_id:test_batch_id+batch_size]
                                )
                        
                        error_sum+=error_i
                    test_accuracy=1.0-error_sum/(len(test_batch_start))
                    if test_accuracy > max_acc_test:
                        max_acc_test=test_accuracy
                    print '\t\tcurrent test acc:', test_accuracy, '\t\t\t\t\tmax_acc_test:', max_acc_test
                else:
                    print 'current dev_accuracy:', dev_accuracy, '\t\t\t\t\tmax_acc_dev:', max_acc_dev

        
        print 'Epoch ', epoch, 'uses ', (time.time()-mid_time)/60.0, 'min'
        mid_time = time.time()
            
        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
                    
    return max_acc_test                
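The update rule assembled in the function above is plain Adagrad: each parameter keeps a running sum of squared gradients, and the step is divided by the square root of that sum. A minimal numpy-only sketch of the same rule, with made-up shapes, looks like this:

import numpy as np

def adagrad_step(param, grad, acc, learning_rate=0.02, eps=1e-8):
    # accumulate squared gradients, then scale the step per coordinate
    acc += grad ** 2
    param -= learning_rate * grad / (np.sqrt(acc) + eps)
    return param, acc

w = np.zeros(5)
acc = np.zeros(5)
for g in np.random.RandomState(0).normal(size=(10, 5)):
    w, acc = adagrad_step(w, g, acc)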
Ejemplo n.º 12
0
Archivo: cnn.py Proyecto: ec1112/indiv
def trainConvNet(data_xy, inp_dim =10, n_epochs = 3, nkerns=[5, 10], batch_size=500, learning_rate=0.1):
	with open("metrics.txt", "a") as f:
		f.write("**********\n")
		f.write("Learning rate: {0}\n".format(learning_rate))
		train_x, train_y, test_x, test_y, valid_x, valid_y = data_xy

		n_train_batches = train_x.get_value(borrow=True).shape[0] / batch_size
		n_valid_batches = valid_x.get_value(borrow=True).shape[0] / batch_size
		n_test_batches = test_x.get_value(borrow=True).shape[0] / batch_size
		print '...building the model'

		kern0_dim = 3
		kern1_dim = 2
		pool0_dim = 2
		pool1_dim = 1



		if inp_dim==20:
			kern0_dim = 3
			kern1_dim = 2
			pool0_dim = 2
			pool1_dim = 1

		if inp_dim==24:
			kern0_dim = 5
			kern1_dim = 3
			pool0_dim = 2
			pool1_dim = 1

		if inp_dim==30:
			kern0_dim = 7
			kern1_dim = 5
			pool0_dim = 2
			pool1_dim = 1
		




		index = T.lscalar()

		x = T.tensor4('x')
		y = T.ivector('y')
		rng = numpy.random.RandomState(23455)

		layer0_input = x.reshape((batch_size, THREE, inp_dim, inp_dim))

		layer0 = LeNetConvPoolLayer(
			rng, 
			input = layer0_input,
			image_shape=(batch_size, THREE, inp_dim, inp_dim),
			filter_shape=(nkerns[0], 3, kern0_dim, kern0_dim),
			poolsize=(pool0_dim, pool0_dim)
		)

		inp1_dim = (inp_dim-kern0_dim+1)/pool0_dim
		layer1 = LeNetConvPoolLayer(
			rng,
			input = layer0.output,
			image_shape=(batch_size, nkerns[0], inp1_dim, inp1_dim),
			filter_shape=(nkerns[1], nkerns[0], kern1_dim, kern1_dim),
			poolsize=(pool1_dim, pool1_dim)
		)

		layer2_input = layer1.output.flatten(2)

		inp2_dim = (inp1_dim-kern1_dim+1)/pool1_dim
		layer2 = HiddenLayer(
			rng,
			input=layer2_input,
			n_in=nkerns[1]*inp2_dim*inp2_dim,
			n_out=300,
			activation=T.tanh
		)

		layer3 = LogisticRegression(input=layer2.output, n_in=300, n_out=10)
		cost = layer3.negative_log_likelihood(y)


		test_model = theano.function([index], layer3.errors(y), givens={
				x: test_x[index*batch_size: (index+1)*batch_size],
				y: test_y[index*batch_size: (index+1)*batch_size]
			})

		validate_model = theano.function([index], layer3.errors(y), givens={
				x: valid_x[index*batch_size: (index+1)*batch_size],
				y: valid_y[index*batch_size: (index+1)*batch_size]
			})

		params = layer3.params + layer2.params + layer1.params + layer0.params

		grads  = T.grad(cost, params)

		updates = [
			(param_i, param_i - learning_rate * grad_i)
			for param_i, grad_i in zip(params, grads)
		] 
	
		train_model = theano.function([index], cost, updates=updates, givens={
				x: train_x[index*batch_size: (index+1)*batch_size],
				y: train_y[index*batch_size: (index+1)*batch_size]
			})

		print 'training... '

		patience = 10000
		patience_increase = 2
		improvement_threshold = 0.995
		validation_frequency = min(n_train_batches, patience / 2)
		best_validation_loss = numpy.inf
		best_iter = 0
		test_score = 0.
		start_time = timeit.default_timer()

		epoch = 0
		done_looping = False

		
		while (epoch < n_epochs) and (not done_looping):
			epoch = epoch + 1
			for minibatch_index in xrange(n_train_batches):
				iter = (epoch - 1) * n_train_batches + minibatch_index
				if iter % 100 == 0:
					print 'training @ iter = ', iter
				cost_ij = train_model(minibatch_index)

				if (iter + 1) % validation_frequency == 0:
					validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
					this_validation_loss = numpy.mean(validation_losses)
					print('epoch %i, minibatch %i/%i, validation error %f %%\n' %(epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))
					f.write("Epoch: {0}\n".format(epoch)) 
					f.write("Validation loss: {0}\n".format(this_validation_loss*100))
					f.write("Cost: {0}\n".format(cost_ij))
					if this_validation_loss < best_validation_loss:
						if this_validation_loss < best_validation_loss * improvement_threshold:
							patience = max(patience, iter * patience_increase)
						best_validation_loss = this_validation_loss
						best_iter = iter
						test_losses = [test_model(i) for i in xrange(n_test_batches)]
						test_score = numpy.mean(test_losses)
						print(('     epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))
				if patience <= iter:
					done_looping = True
					break

		end_time = timeit.default_timer()
		print('Optimization complete.')
		print('Best validation score of %f %% obtained at iteration %i, '
	          'with test performance %f %%' %
	          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
		print >> sys.stderr, ('The code for file ' +
	                          os.path.split(__file__)[1] +
	                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

		print ('saving params for patch width: %i...' %(inp_dim))
		save_file = open('param'+str(inp_dim)+'.pkl', 'wb')
		W0 = layer0.params[0]; b0 = layer0.params[1]
		W1 = layer1.params[0]; b1 = layer1.params[1]
		cPickle.dump(W0.get_value(borrow=True), save_file, -1)
		cPickle.dump(b0.get_value(borrow=True), save_file, -1)
		cPickle.dump(W1.get_value(borrow=True), save_file, -1)
		cPickle.dump(b1.get_value(borrow=True), save_file, -1)
		save_file.close()
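Because the parameters are pickled one array at a time, they must be read back in exactly the same order they were written. A small loading sketch under that assumption (the 'param<inp_dim>.pkl' filename pattern mirrors the dump above):

import cPickle

def load_conv_params(inp_dim):
    # read W0, b0, W1, b1 in the same order they were dumped
    with open('param' + str(inp_dim) + '.pkl', 'rb') as load_file:
        W0 = cPickle.load(load_file)
        b0 = cPickle.load(load_file)
        W1 = cPickle.load(load_file)
        b1 = cPickle.load(load_file)
    return W0, b0, W1, b1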
    def __init__(self,
                 batch_size,
                 kernels,
                 input_dimensions,
                 convolution_dimensions,
                 pool_sizes,
                 stride_sizes,
                 layer_pattern,
                 relu_pattern,
                 dropout_rate,
                 rng_seed=None,
                 base_learning_rate=0.05,
                 momentum=0.8,
                 learning_decay_per_epoch=0.91,
                 l2_norm=0,
                 name="default",
                 param_index=0,
                 address='',
                 n_epochs=200,
                 batch_normalization_pattern=None,
                 batch_norm_learning_rate=0.1,
                 batch_norm_decay_per_epoch=0.95,
                 batchnorm_vals_filename=None,
                 batchnorm_slide_percent=0.):
        """
        batch_size - int - size of each batch
        kernels - int array - number of feature maps / units in each layer (incl. input/output)
        input_dimensions - int array[2] -  dimensions of input
        convolution_dimensions - int array[2] array - dimensions of each convolution
        pool_sizes - int array[2] array - dimensions of pooling for each convolution
        stride_sizes - int array - length of strides for each convolutional layer (this overrides aspects of pooling behavior)
        layer_pattern - ['I','C',...,'C','F',...,'F','O'] - indicates pattern of layers
        relu_pattern - boolean array that describes if convolutional layers should be rectified; doesn't do anything for other types of layers (including input)
        dropout_rate - float - rate of dropout for network weights
        rng_seed - int - seed for random number generator; None defaults to random
        base_learning_rate - floatX - initial learning rate
        momentum - floatX - fraction of the previous update that carries over to the next iteration
        learning_decay_per_epoch - floatX - factor for decreasing learning rate over epochs
        name - string that describes the beginning of the filenames of the network pickle
        param_index - integer determined a priori to index the param configurations and show it in the filename
        batchnorm_vals_filename - has to be constructed by a separate file; pre-defines the mean and sd of each layer for a network; it may be preferable to use sliding instead
        batchnorm_slide_percent - sort of like momentum, but for calculations of batch-normalization means and standard deviations
        """
        #initialize arrays containing basic information and hyperparameters

        self.layers = []
        self.uses_batch_normalization = bool(batch_normalization_pattern)
        self.batch_norm_pattern = batch_normalization_pattern
        self.batchnorm_vals_filename = batchnorm_vals_filename
        self.batchnorm_slide_percent = batchnorm_slide_percent
        if not self.uses_batch_normalization:
            self.batch_norm_pattern = [False for _ in relu_pattern]
        self.address = address
        #replace future instances of self.kernel
        self.kernels = kernels
        self.input_dimensions = input_dimensions
        self.output_size = kernels[-1:][0]
        self.inputs = []
        self.batch_size = batch_size
        self.x = x = T.ftensor4('x')
        self.y = y = T.ivector('y')
        self.rng = np.random.RandomState(rng_seed)
        self.name = name
        self.n_epochs = n_epochs
        self.shapes = [(input_dimensions[0], input_dimensions[1])]
        print "input shape: " + str(self.shapes)
        self.convolution_dimensions = convolution_dimensions
        self.rng_seed = rng_seed
        self.layer_pattern = layer_pattern
        self.current_batch_index = 0
        self.batch_size = batch_size
        self.pool_sizes = pool_sizes
        self.stride_sizes = stride_sizes
        self.relu_pattern = relu_pattern
        #if the rate is a float, each layer has the same rate
        if isinstance(dropout_rate, float):
            dropout_rate = [dropout_rate for _ in layer_pattern]
        self.dropout_rate = dropout_rate

        self.learning_decay_per_epoch = learning_decay_per_epoch
        self.l2_norm = l2_norm
        #get some info from prepare_image_data.py
        #files_list, outputs, y_dim = prepare_image_data.get_data()
        #self.files_list = files_list
        #self.y_dim = y_dim
        #self.outputs=outputs
        self.fetcher = prepare_image_data.fetcher(self.batch_size)
        #indexing information
        self.ratios = np.asarray([0.6, 0.2, 0.2])
        self.index = index = T.lscalar()
        #temporarily hardcoded
        self.n_train_batches = 400
        self.n_valid_batches = 120
        self.n_test_batches = 120
        self.cat_labels = self.fetcher.valid_names
        self.y_dim = len(self.cat_labels)
        self.momentum = theano.shared(np.float32(momentum))
        self.base_learning_rate = np.float32(base_learning_rate)
        self.learning_rate = theano.shared(
            np.float32(base_learning_rate * (1 - momentum)))
        self.index = index = T.lscalar()
        self.momentum_raw = momentum
        self.learning_rate_raw = self.learning_rate.get_value()
        if self.uses_batch_normalization:
            self.batch_norm_learning_rate_raw = batch_norm_learning_rate
            self.batch_norm_learning_rate = theano.shared(
                np.float32(self.batch_norm_learning_rate_raw))
        self.epoch = 0
        #initialize basic file shapes
        #recent change: changed kernel_sizes to self.kernels
        self.training_x = theano.shared(np.zeros(
            shape=(batch_size, self.kernels[0], input_dimensions[0],
                   input_dimensions[1]),
            dtype=theano.config.floatX),
                                        borrow=True)
        self.input = self.x.reshape((self.batch_size, self.kernels[0],
                                     self.shapes[0][0], self.shapes[0][1]))
        #updated database-based retrieval
        self.training_y = theano.shared(np.zeros(shape=self.batch_size,
                                                 dtype=np.int32),
                                        borrow=True)
        self.testing_x = theano.shared(np.zeros(
            shape=(self.batch_size, kernels[0], input_dimensions[0],
                   input_dimensions[1]),
            dtype=theano.config.floatX),
                                       borrow=True)
        self.testing_y = theano.shared(np.zeros(shape=self.batch_size,
                                                dtype=np.int32),
                                       borrow=True)
        self.validation_x = theano.shared(np.zeros(
            shape=(self.batch_size, kernels[0], input_dimensions[0],
                   input_dimensions[1]),
            dtype=theano.config.floatX),
                                          borrow=True)
        self.validation_y = theano.shared(np.zeros(shape=self.batch_size,
                                                   dtype=np.int32),
                                          borrow=True)
        #load fixed mean and sd values if file exists
        if self.batchnorm_vals_filename is not None:
            with open(self.batchnorm_vals_filename, 'rb') as bn_file:
                self.batchnorm_fixed_values = pickle.load(bn_file)
        else:
            self.batchnorm_fixed_values = [
                None for _ in range(len(layer_pattern))
            ]
        ###begin creation of layers
        #I = "input";C = "Convolutional"; F = "Fully-Connected", O = "Output"
        for i, pattern in enumerate(layer_pattern):
            if pattern == "I":
                self.inputs.append(self.input)
                print 'inserted input'
            elif pattern == "C":

                self.layers.append(
                    NetConvPoolLayer(
                        self.rng,
                        input = self.inputs[i-1],
                        image_shape=(
                            batch_size,kernels[i-1],
                            self.shapes[i-1][0],
                            self.shapes[i-1][1]
                            ),
                        filter_shape=(
                            kernels[i],
                            kernels[i-1],
                            self.convolution_dimensions[i-1][0],
                            self.convolution_dimensions[i-1][1]),
                        poolsize = pool_sizes[i-1],
                        stride = stride_sizes[i-1],
                        dropout_percent = self.dropout_rate[i],
                        batch_norm = self.batch_norm_pattern[i],
                        batchnorm_slide_percent = self.batchnorm_slide_percent,
                        precalculated_batchnorm_values = self.\
                            batchnorm_fixed_values[i-1])
                    )
                x_new = (
                    self.shapes[i-1][0] - self.convolution_dimensions[i-1][0] + \
                        1 - (pool_sizes[i-1][0] - stride_sizes[i-1][0]))/\
                        (stride_sizes[i-1][0]
                         )
                y_new = (
                    self.shapes[i-1][1] - self.convolution_dimensions[i-1][1] + 1 -\
                        (pool_sizes[i-1][1] - stride_sizes[i-1][1]))/\
                        (stride_sizes[i-1][1]
                         )
                self.inputs.append(self.layers[i - 1].output)
                self.shapes.append((x_new, y_new))
                print "self.shapes: " + str(self.shapes)
                print 'added convolution layer'
            elif pattern == "F":
                if layer_pattern[i - 1] == "C":
                    next_input = self.inputs[i - 1].flatten(2)
                else:
                    next_input = self.inputs[i - 1]
                self.layers.append(
                    HiddenLayer(self.rng,
                                input=next_input,
                                n_in=kernels[i - 1] * self.shapes[i - 1][0] *
                                self.shapes[i - 1][1],
                                n_out=kernels[i],
                                activation=T.tanh,
                                dropout_rate=self.dropout_rate[i]))
                self.inputs.append(self.layers[i - 1].output)
                #the shape is only used to determine dimensions of the next layer
                self.shapes.append((1, 1))  #see if this fixes issue
                print 'added fully-connected hidden layer, shape=%s' %\
                    str(self.shapes[-1])
            else:
                if layer_pattern[i - 1] == "C":
                    next_input = self.inputs[i - 1].flatten(2)
                else:
                    next_input = self.inputs[i - 1]
                self.layers.append(
                    LogisticRegression(input=next_input,
                                       n_in=kernels[i - 1],
                                       n_out=self.output_size,
                                       rng=self.rng,
                                       dropout_rate=self.dropout_rate[i]))
                last_index = i - 1
                print 'added logistic layer'
        zero = np.float32(0.)
        self.L2_penalty = theano.shared(np.float32(l2_norm))
        self.params = params  = [param for layer in self.layers \
                                     for param in layer.params]
        self.cost = self.layers[last_index].negative_log_likelihood(self.y) +\
            self.L2_penalty * (
        T.sum([T.sum(self.layers[q].W * self.layers[q].W)\
                   for q in range(len(self.layers))]))
        #updating functions (incl. momentum)
        #update 1 (only used for derivation in update #4)
        self.old_updates = [theano.shared(zero * param_i.get_value())\
                                for param_i in params]
        self.current_delta = [theano.shared(np.float32(zero * param_i.get_value()))\
                                  for param_i in params]
        self.grads = T.grad(self.cost, params)
        #update 2
        self.current_change_update = [
            (current_delta_i, self.learning_rate * grad_i +\
                 self.momentum * old_updates_i)\
                for current_delta_i,grad_i, old_updates_i in\
                zip(self.current_delta,self.grads,self.old_updates)
            ]
        #update 3
        updates = [
            ( param_i,param_i - current_delta_i) for param_i, current_delta_i in\
                zip(params,self.current_delta)]
        #self.updates = []
        #update 4 (derived from update #1)
        momentum_updates = [(old_updates_i, current_delta_i)\
                                for old_updates_i, current_delta_i in\
                                zip(self.old_updates,self.current_delta)]
        #self.momentum_updates = []
        #now batch-normalization updates when needed
        batchnorm_sliding_updates = []
        for layer in self.layers:
            if not isinstance(layer, NetConvPoolLayer):
                continue
            if layer.batchnorm_slide_percent != 0.:
                batchnorm_sliding_updates += [
                    (layer.sd_input_old, layer.sd_input),
                    (layer.means_old, layer.sd_input)
                ]
        #combined updates
        self.all_updates = self.current_change_update + updates +\
            momentum_updates + batchnorm_sliding_updates
        #test model function
        self.test_model = theano.function([],
                                          self.layers[last_index].errors(
                                              self.y),
                                          givens={
                                              x: self.testing_x,
                                              y: self.testing_y
                                          })
        #validation model function
        self.validate_model = theano.function([],
                                              self.layers[last_index].errors(
                                                  self.y),
                                              givens={
                                                  x: self.validation_x,
                                                  y: self.validation_y
                                              })
        #training function
        self.train_model = theano.function([],
                                           self.cost,
                                           updates=self.all_updates,
                                           givens={
                                               x: self.training_x,
                                               y: self.training_y
                                           })
        self.patience = 20000
        self.patience_increase = 3
        self.improvement_threshold = 0.995
        self.validation_frequency = min(self.n_train_batches,
                                        self.patience // 2)
        self.best_validation_loss = np.inf
        self.best_iter = 0
        #DEPRECATED
        self.itermode = 'train'
        self.test_score = 0.
        self.start_time = timeit.default_timer()
        self.epoch = 0
        self.iter_i = 0  # renamed because `iter` would shadow the builtin
        self.done_looping = False
        self.param_index = param_index
        #constant-defined stuff
        self.improvement_threshold = 0.995
        self.validation_frequency = min(self.n_train_batches,
                                        self.patience // 2)
        self.done_looping = False
        print 'initialized neural network object'
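A constructor with this many knobs is easier to read next to a concrete call. The sketch below is purely illustrative: the class name ConvNet is a placeholder (the class statement is not shown above) and every argument value is invented; they only follow the conventions documented in the docstring (one convolution/pool/stride entry per 'C' layer, one kernels entry per layer).

# hypothetical instantiation; `ConvNet` stands in for whatever class owns the __init__ above
net = ConvNet(
    batch_size=50,
    kernels=[3, 32, 64, 256, 10],            # input channels ... output classes
    input_dimensions=[64, 64],
    convolution_dimensions=[(5, 5), (3, 3)],  # one entry per 'C' layer
    pool_sizes=[(2, 2), (2, 2)],
    stride_sizes=[(2, 2), (2, 2)],
    layer_pattern=['I', 'C', 'C', 'F', 'O'],
    relu_pattern=[False, True, True, False, False],
    dropout_rate=0.3,
    rng_seed=1234,
)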
Ejemplo n.º 14
0
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    print "mode: ", mode
    print "data_name: ", data_name
    print "pooling_mode: ", pooling_mode
    print "Started!"
    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    corpusPos = T.matrix("corpusPos")
    docLabel = T.ivector('docLabel')

    corpus0 = T.concatenate([corpus, corpusPos], axis=1)

    # for list-type data
    layer0 = DocEmbeddingNN(corpus0, docSentenceCount, sentenceWordCount, rng, \
                wordEmbeddingDim=249, \
                 sentenceLayerNodesNum=50, \
                 sentenceLayerNodesSize=[5, 249], \
                 docLayerNodesNum=10, \
                 docLayerNodesSize=[3, 50],
                 pooling_mode=pooling_mode)

    layer1 = HiddenLayer(rng,
                         input=layer0.output,
                         n_in=layer0.outputDimension,
                         n_out=10,
                         activation=T.tanh)

    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Load the parameters last time, optionally.

    # 	data_name = "car"

    para_path = "data/" + data_name + "/model/multi_input_mergeinput" + pooling_mode + ".model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"

    loadParamsVal(para_path, params)

    if (mode == "train" or mode == "test"):
        learning_rate = 0.1
        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)

        grads = T.grad(cost, params)

        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        print "Loading test data."
        cr_test = CorpusReader(minDocSentenceNum=5,
                               minSentenceWordNum=5,
                               dataset=testtext,
                               labelset=testlabel)
        validDocMatrixes, validDocSentenceNums, validSentenceWordNums, validIds, validLabels, _, validPosList = cr_test.getCorpus(
            [0, 1000])

        # 		print "Right answer: "
        # 		print zip(validIds, validLabels)

        validDocMatrixes = transToTensor(validDocMatrixes,
                                         theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums,
                                              numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        validPosList = transToTensor(validPosList, theano.config.floatX)
        print "Data loaded."

        valid_model = theano.function(
            [], [
                cost, error, layer2.y_pred, docLabel,
                T.transpose(layer2.p_y_given_x)[1]
            ],
            givens={
                corpus: validDocMatrixes,
                corpusPos: validPosList,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            })

        # ####Validate the model####
        costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
        print "Valid current model:"
        print "Cost: ", costNum
        print "Error: ", errorNum
        # 		print "Valid Pred: ", pred_label
        # 		print "pred_prob: ", pred_prob

        fpr, tpr, _ = roc_curve(real_label, pred_prob)
        if mode == "test":
            print "tpr_all: ", tpr
            print "fpr_all: ", fpr
        roc_auc = auc(fpr, tpr)
        print "data_name: ", data_name
        print "test_dataname: ", test_dataname
        print "ROC: ", roc_auc

        fpr, tpr, threshold = roc_curve(real_label, pred_label)

        index_of_one = list(threshold).index(1)
        ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
        print "TPR: ", tpr[index_of_one]
        print "FPR: ", fpr[index_of_one]
        print "AR: ", ar
        print "threshold: ", threshold[index_of_one]
        if mode == "test":
            valid_model.free()
            return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar

        print "Loading train data."
        cr_train = CorpusReader(minDocSentenceNum=5,
                                minSentenceWordNum=5,
                                dataset=traintext,
                                labelset=trainlabel)
        docMatrixes, docSentenceNums, sentenceWordNums, ids, labels, _, posList = cr_train.getCorpus(
            [0, 100000])

        # 		print "Right answer: "
        # 		print zip(ids, labels)

        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)
        posList = transToTensor(posList, theano.config.floatX)

        # 	valid_cr = CorpusReader(minDocSentenceNum=5, minSentenceWordNum=5, dataset="data/valid/split", labelset="data/valid/label.txt")
        print
        index = T.lscalar("index")
        batchSize = 10
        n_batches = (len(docSentenceNums.get_value()) - 1 - 1) / batchSize + 1
        print
        print "Train set size is ", len(docMatrixes.get_value())
        print "Validating set size is ", len(validDocMatrixes.get_value())
        print "Batch size is ", batchSize
        print "Number of training batches  is ", n_batches

        print "Compiling computing graph."

        # for list-type data
        train_model = theano.function(
            [index], [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus:
                docMatrixes,
                corpusPos:
                posList,
                docSentenceCount:
                docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                sentenceWordCount:
                sentenceWordNums,
                docLabel:
                labels[index * batchSize:(index + 1) * batchSize],
            })

        print "Compiled."
        print "Start to train."
        epoch = 0
        n_epochs = 10
        ite = 0

        while (epoch < n_epochs):
            epoch = epoch + 1
            #######################
            for i in range(n_batches):
                # for list-type data
                print ".",
                costNum, errorNum, pred_label, real_label = train_model(i)
                print ".",
                ite = ite + 1
                # for padding data
                # 			costNum, errorNum = train_model(docMatrixes, labels)
                # 			del docMatrixes, docSentenceNums, sentenceWordNums, labels
                # print ".",
                if (ite % 10 == 0):
                    print
                    print "@iter: ", ite
                    print "Cost: ", costNum
                    print "Error: ", errorNum

            # Validate the model
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model(
            )
            print "Valid current model:"
            print "Cost: ", costNum
            print "Error: ", errorNum
            # 			print "pred_prob: ", pred_prob
            # 			print "Valid Pred: ", pred_label

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            roc_auc = auc(fpr, tpr)
            print "data_name: ", data_name
            print "test_dataname: ", test_dataname
            print "ROC: ", roc_auc

            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            index_of_one = list(threshold).index(1)
            print "TPR: ", tpr[index_of_one]
            print "FPR: ", fpr[index_of_one]
            print "AR: ", (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print "threshold: ", threshold[index_of_one]
            # Save model
            print "Saving parameters."
            saveParamsVal(para_path, params)
            print "Saved."
        valid_model.free()
        train_model.free()
    elif (mode == "deploy"):
        print "Compiling computing graph."
        output_model = theano.function(
            [corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print "Compiled."
        cr = CorpusReader(minDocSentenceNum=5,
                          minSentenceWordNum=5,
                          dataset="data/train_valid/split")
        count = 21000
        while (count <= 21000):
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus(
                [count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print "start to predict."
            pred_y = output_model(docMatrixes, docSentenceNums,
                                  sentenceWordNums)
            print "End predicting."
            print "Writing resfile."
            # 		print zip(ids, pred_y[0])
            f = file("data/test/res/res" + str(count), "w")
            f.write(str(zip(ids, pred_y[0])))
            f.close()
            print "Written." + str(count)
            count += 100
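The evaluation block above uses sklearn.metrics twice: once to get the full ROC AUC from the predicted probabilities, and once to read a single operating point (TPR, FPR, AR) off the ROC of the hard 0/1 predictions at threshold 1. In isolation, with toy labels and scores, that computation looks like this:

from sklearn.metrics import roc_curve, auc

real_label = [0, 0, 1, 1, 1]
pred_prob = [0.1, 0.4, 0.35, 0.8, 0.7]
pred_label = [0, 0, 1, 1, 1]

fpr, tpr, _ = roc_curve(real_label, pred_prob)
roc_auc = auc(fpr, tpr)

# on the ROC of the hard 0/1 predictions, the point at threshold 1 is the
# classifier's actual operating point
fpr_h, tpr_h, threshold = roc_curve(real_label, pred_label)
idx = list(threshold).index(1)
ar = (tpr_h[idx] + 1 - fpr_h[idx]) / 2.0   # average of TPR and TNR
print roc_auc, tpr_h[idx], fpr_h[idx], ar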
Ejemplo n.º 15
0
def evaluate_lenet5(learning_rate=0.12, n_epochs=200,
                    nkerns=[20, 30 ,20,50,20 ], batch_size=20):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = numpy.random.RandomState()

    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = int(train_set_x.get_value(borrow=True).shape[0] / batch_size)
    n_valid_batches = int(valid_set_x.get_value(borrow=True).shape[0] / batch_size)
    n_test_batches = int(test_set_x.get_value(borrow=True).shape[0] / batch_size)


    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')
    layer0_input = x.reshape((batch_size, 1, 64, 64))
    layer0 = ConvLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 64, 64),
        filter_shape=(nkerns[0], 1, 1, 1),
    )
    layer0b = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 64, 64),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )

    layer1 = ConvLayer(
        rng,
        input=layer0b.output,
        image_shape=(batch_size, nkerns[1], 30, 30),
        filter_shape=(nkerns[2], nkerns[1], 1, 1),
    )

    layer1b = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[2], 30, 30),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(2, 2)
    )

    layer1c = ConvLayer(
        rng,
        input=layer1b.output,
        image_shape=(batch_size, nkerns[3], 14, 14),
        filter_shape=(nkerns[4], nkerns[3], 1, 1),
    )

    layer1d = LeNetConvPoolLayer(
        rng,
        input=layer1c.output,
        image_shape=(batch_size, nkerns[4], 14, 14),
        filter_shape=(40, nkerns[4], 3, 3),
        poolsize=(2, 2)
    )
    # global average pooling over the final feature maps
    layer2 = AveragePoolLayer(
        input=layer1d.output,
        poolsize=(6,6)
    )
    layer3_input = theano.tensor.flatten(layer2.output, outdim=2)#layer2.output.flatten(outdim=2)

    # classify the pooled features with logistic regression
    layer3 = LogisticRegression(input=layer3_input, n_in=40, n_out=40)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer1d.params + layer1c.params + layer1b.params + layer1.params + layer0b.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # early-stopping parameters
    patience = 2000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                learning_rate *= 0.8  # note: only rebinds the Python float; the compiled updates keep the original rate
            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print(('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.)))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print((('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.)))
                    best_score = (this_validation_loss + test_score)/2
                    #with open('inception.pkl', 'wb') as f:
                        #pickle.dump([layer0,layer0b,layer1,layer1b,layer2,layer3], f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.)))
    print(('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
    return best_score
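The early-stopping bookkeeping in the loop above (patience, patience_increase, improvement_threshold, validation_frequency) does not depend on Theano at all. Stripped to its essentials, with a made-up validate() callback standing in for the validation pass, the policy is:

def train_with_patience(validate, n_batches, n_epochs=200,
                        patience=2000, patience_increase=2,
                        improvement_threshold=0.995):
    # validate(it) is assumed to return the current validation loss
    validation_frequency = min(n_batches, patience // 2)
    best_loss = float('inf')
    for epoch in range(n_epochs):
        for minibatch_index in range(n_batches):
            it = epoch * n_batches + minibatch_index
            if (it + 1) % validation_frequency == 0:
                loss = validate(it)
                if loss < best_loss:
                    # a sufficiently large improvement extends the patience window
                    if loss < best_loss * improvement_threshold:
                        patience = max(patience, it * patience_increase)
                    best_loss = loss
            if patience <= it:
                return best_loss
    return best_loss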
Ejemplo n.º 16
0
    def __init__(self,
                 numpy_rng,
                 theano_rng=None,
                 n_ins=784,
                 hidden_layers_sizes=[500, 500],
                 n_outs=10):
        """
        This class implements a DBN with a variable number of layers.

        :param numpy_rng: numpy.random.RandomState  numpy random number generator used to initialize the weights
        :param theano_rng: theano.tensor.shared_randomstreams.RandomStreams
                            generated from numpy_rng if None is given
        :param n_ins: int  dimension of the DBN input
        :param hidden_layers_sizes: list  sizes of the hidden layers
        :param n_outs: int  dimension of the network output
        :return:
        """
        self.sigmoid_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)
        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2**30))
        # allocate symbolic variables
        self.x = T.matrix('x')
        self.y = T.ivector('y')

        # The DBN is an MLP whose intermediate-layer weights are shared with the RBMs.
        # We first build the DBN as a deep multilayer perceptron; while constructing each
        # sigmoid layer we also build an RBM that shares its weights. In the pre-training
        # phase the RBMs are trained (which also changes the MLP weights); in the
        # fine-tuning phase the DBN is trained by stochastic gradient descent on the MLP.

        for i in xrange(self.n_layers):
            # build the sigmoid layer:
            # for the first layer, the input size is the input size of the network
            # for the other layers, it is the number of hidden units in the layer below
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]
            # for the first layer, the input is the input of the network
            # for the other layers, the input is the activation of the hidden layer below
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[i - 1].output
            # define the sigmoid layer
            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            self.sigmoid_layers.append(sigmoid_layer)

            # the parameters of the sigmoid layers are parameters of the DBN; the visible
            # biases of the RBMs belong only to the RBMs, not to the DBN
            self.params.extend(sigmoid_layer.params)

            # construct an RBM that shares weights with this layer
            rbm_layer = RBM(input=layer_input,
                            n_visible=input_size,
                            n_hidden=hidden_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b,
                            numpy_rng=numpy_rng,
                            theano_rng=theano_rng)

            self.rbm_layers.append(rbm_layer)

        # add a logistic regression layer on top of the network
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs)
        self.params.extend(self.logLayer.params)

        # finetuning cost, defined as the negative log likelihood of the logistic regression (output) layer
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

        # compute the per-minibatch errors, given self.x and self.y
        self.errors = self.logLayer.errors(self.y)
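A hedged usage sketch for the constructor above. The class name DBN and the MNIST-style sizes are assumptions; the listing only shows the __init__ body.

import numpy

# Hypothetical instantiation (the class name `DBN` is assumed, not shown in the listing).
numpy_rng = numpy.random.RandomState(123)   # seeds both weight init and the Theano RNG built inside __init__
dbn = DBN(numpy_rng=numpy_rng,
          n_ins=28 * 28,                    # e.g. flattened MNIST digits
          hidden_layers_sizes=[500, 500],   # two stacked sigmoid/RBM layers
          n_outs=10)                        # 10 output classes
# dbn.finetune_cost and dbn.errors are the symbolic nodes one would compile
# into the finetuning and validation functions.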
Ejemplo n.º 17
0
    def __init__(self, rng, input, layer_sizes, use_bias=True, rectifier=None):

        if rectifier == 'soft':
            rectified_linear_activation = lambda x: T.nnet.softplus(x)
        elif rectifier == 'hard':
            rectified_linear_activation = lambda x: T.maximum(0.0, x)
        else:
            # default to the standard (hard) ReLU so the activation is always defined
            rectified_linear_activation = lambda x: T.maximum(0.0, x)

        # Set up all the hidden layers
        weight_matrix_sizes = zip(layer_sizes, layer_sizes[1:])
        self.layers = []
        self.dropout_layers = []
        next_layer_input = input
        # dropout the input with prob 0.2
        next_dropout_layer_input = _dropout_from_layer(rng, input, p=0.2)
        for n_in, n_out in weight_matrix_sizes[:-1]:
            next_dropout_layer = DropoutHiddenLayer(
                rng=rng,
                input=next_dropout_layer_input,
                activation=rectified_linear_activation,
                n_in=n_in,
                n_out=n_out,
                use_bias=use_bias)
            self.dropout_layers.append(next_dropout_layer)
            next_dropout_layer_input = next_dropout_layer.output

            # Reuse the parameters from the dropout layer here, in a different
            # path through the graph.
            next_layer = HiddenLayer(rng=rng,
                                     input=next_layer_input,
                                     activation=rectified_linear_activation,
                                     W=next_dropout_layer.W * 0.5,
                                     b=next_dropout_layer.b,
                                     n_in=n_in,
                                     n_out=n_out,
                                     use_bias=use_bias)
            self.layers.append(next_layer)
            next_layer_input = next_layer.output

        # Set up the output layer
        n_in, n_out = weight_matrix_sizes[-1]
        dropout_output_layer = LogisticRegression(
            input=next_dropout_layer_input, n_in=n_in, n_out=n_out)
        self.dropout_layers.append(dropout_output_layer)

        # Again, reuse parameters in the dropout output.
        output_layer = LogisticRegression(input=next_layer_input,
                                          W=dropout_output_layer.W * 0.5,
                                          b=dropout_output_layer.b,
                                          n_in=n_in,
                                          n_out=n_out)
        self.layers.append(output_layer)

        # Use the negative log likelihood of the logistic regression layer as
        # the objective.
        self.dropout_negative_log_likelihood = self.dropout_layers[
            -1].negative_log_likelihood
        self.dropout_errors = self.dropout_layers[-1].errors

        self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
        self.errors = self.layers[-1].errors

        # Grab all the parameters together.
        self.params = [
            param for layer in self.dropout_layers for param in layer.params
        ]
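The constructor above calls a _dropout_from_layer helper that is not shown in this listing. A plausible implementation, assuming it follows the common Theano dropout recipe (an independent binary keep-mask per unit):

import theano
import theano.tensor as T
from theano.tensor.shared_randomstreams import RandomStreams

def _dropout_from_layer(rng, layer, p):
    """Zero out each unit of `layer` with probability p (a sketch, not the original helper)."""
    srng = RandomStreams(rng.randint(999999))
    mask = srng.binomial(n=1, p=1 - p, size=layer.shape)  # keep a unit with probability 1 - p
    return layer * T.cast(mask, theano.config.floatX)     # cast so the result stays in floatX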
Ejemplo n.º 18
0
    def __init__(self, rng, n_in, n_hidden, n_out, x=None, y=None, activation=T.tanh,
                 lambda_reg=0.001, alpha_reg=0.0):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: list of int
        :param n_hidden: sizes of the hidden layers

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie
        
        :type lambda_reg: float
        :param lambda_reg: parameter to control the sparsity of weights by l_1 norm.
        The regularization term is lambda_reg( (1-alpha_reg)/2 * \sum||W||_2^2 + alpha_reg \sum||W||_1 ).
        Thus, the larger lambda_reg is, the sparser the weights are.
        
        :type alpha_reg: float
        :param alpha_reg: parameter from interval [0,1] to control the smoothness of weights by squared l_2 norm.
        The regularization term is lambda_reg( (1-alpha_reg)/2 * \sum||W||_2^2 + alpha_reg \sum||W||_1 ),
        Thus, the smaller alpha_reg is, the smoother the weights are.
        """
        self.hidden_layers=[]
        self.params=[]
        self.n_layers=len(n_hidden)
        if x is None:
            x=T.matrix('x')
        self.x=x
        if y is None:
            y=T.ivector('y')
        self.y=y
        for i in range(len(n_hidden)):
            if i==0: # first hidden layer
                hd=HiddenLayer(rng=rng, input=self.x, n_in=n_in, n_out=n_hidden[i],
                               activation=activation)
            else:
                hd=HiddenLayer(rng=rng, input=self.hidden_layers[i-1].output, n_in=n_hidden[i-1], n_out=n_hidden[i],
                               activation=activation)
            self.hidden_layers.append(hd)
            self.params.extend(hd.params)
            
        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        if self.n_layers>0:
            self.logRegressionLayer = LogisticRegression(input=self.hidden_layers[-1].output,
                                                     n_in=n_hidden[-1], n_out=n_out)
        else:
            self.logRegressionLayer = LogisticRegression(input=self.x,
                                                     n_in=n_in, n_out=n_out)

            
        self.params.extend(self.logRegressionLayer.params)
        
        # regularization terms
        L1s=[]
        L2_sqrs=[]
        #L1s.append(abs(self.hidden_layers[0].W).sum())
        for i in range(len(n_hidden)):
            L1s.append (abs(self.hidden_layers[i].W).sum())
            L2_sqrs.append((self.hidden_layers[i].W ** 2).sum())
        L1s.append(abs(self.logRegressionLayer.W).sum())
        L2_sqrs.append((self.logRegressionLayer.W ** 2).sum())
        self.L1 = T.sum(L1s)
        self.L2_sqr = T.sum(L2_sqrs)

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors(self.y)
        # cost function to be minimized
        self.cost =  self.negative_log_likelihood(self.y) \
         + lambda_reg * ( (1.0-alpha_reg)*0.5* self.L2_sqr +  alpha_reg*self.L1)
        self.y_pred=self.logRegressionLayer.y_pred
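To make the roles of lambda_reg and alpha_reg concrete, the same elastic-net penalty described in the docstring can be computed on plain NumPy arrays (a worked example, not part of the class):

import numpy as np

def elastic_net_penalty(weight_matrices, lambda_reg=0.001, alpha_reg=0.0):
    """lambda_reg * ((1 - alpha_reg)/2 * sum ||W||_2^2 + alpha_reg * sum ||W||_1)."""
    l1 = sum(np.abs(W).sum() for W in weight_matrices)
    l2_sqr = sum((W ** 2).sum() for W in weight_matrices)
    return lambda_reg * ((1.0 - alpha_reg) * 0.5 * l2_sqr + alpha_reg * l1)

# alpha_reg = 0 gives a pure (smooth) L2 penalty, alpha_reg = 1 a pure (sparsity-inducing) L1 penalty.
example_weights = [np.ones((3, 2)), np.ones((2, 4))]
print(elastic_net_penalty(example_weights, lambda_reg=0.001, alpha_reg=0.5))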
Ejemplo n.º 19
0
def sgd_optimization_mnist(tr_start_index=1, tr_limit=5000, vl_start_index=1, vl_limit=5000,
                           learning_rate=0.015, n_epochs=5000
                           , output_filename="ls.out"):

    output_file = open(output_filename,'w')
    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                               # [int] labels
    in_shape = layer0_input_shape[0] * layer0_input_shape[1]

    batch_size = tr_limit
    train_set = tdtf.read_data_patch_to_ndarray(train_dataset_route, tr_start_index, tr_limit)
    datasets = load_data.shared_dataset(train_set)
    train_set_x, train_set_y = datasets

    valid_set = tdtf.read_data_patch_to_ndarray(valid_dataset_route, vl_start_index, vl_limit)
    print valid_set[1]
    datasets = load_data.shared_dataset(valid_set)
    valid_set_x, valid_set_y = datasets

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    #n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    if if_load_trained_model:
        trained_model_pkl = open(train_model_route, 'r')
        trained_model_state_list = cPickle.load(trained_model_pkl)
        trained_model_state_array = numpy.load(trained_model_pkl)
        classifier_state = trained_model_state_array[0]

        classifier = LogisticRegression(input=x, n_in=in_shape, n_out=layer0_output_shape
                                        , W=classifier_state[0], b=classifier_state[1])

    else:

        ######################
        # BUILD ACTUAL MODEL #
        ######################
        #print '... building the model'

        # construct the logistic regression class
        rng = numpy.random.RandomState(23555)
        W_bound=1
        tmp_W = theano.shared(numpy.asarray(
                rng.uniform(low=0, high=W_bound, size=(in_shape, layer0_output_shape)), dtype=theano.config.floatX),
                borrow=True)
        classifier = LogisticRegression(input=x, n_in=in_shape, n_out=layer0_output_shape)
                                    #,W=tmp_W)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    validate_model = theano.function(inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    # compute the gradient of cost with respect to theta = (W,b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # compiling a Theano function `train_model` that returns the cost, but in
    # the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(inputs=[index], \
            outputs=[cost, classifier.errors(y)], \
            updates=updates, \
            givens={ \
                x: train_set_x[index * batch_size:(index + 1) * batch_size], \
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    #print '... training the model'
    # early-stopping parameters
    patience = 50000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                                  # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                  # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()
    best_train_loss = numpy.inf

    done_looping = False
    epoch = 0

    last_train_err = 1
    last_train_cost = 1

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            minibatch_avg_cost, train_err = train_model(minibatch_index)
            decreasing_rate = (last_train_err - train_err) / (last_train_err) * 100.
            last_train_err = train_err
            c_d_rate = (last_train_cost - minibatch_avg_cost) / (last_train_cost) * 100.
            last_train_cost = minibatch_avg_cost
            print >> output_file, ('epoch %i, minibatch %i/%i, train_cost %f , train_error %.2f %%, decreasing rate %f %%, cost_decreasing rate %f %%' % \
                (epoch, minibatch_index + 1, n_train_batches,
                minibatch_avg_cost,
                train_err* 100.
                ,decreasing_rate
                ,c_d_rate))

            if best_train_loss > train_err:
                best_train_loss = train_err

            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print >> output_file, ('epoch %i, minibatch %i/%i, validation error %f %%' % \
                    (epoch, minibatch_index + 1, n_train_batches,
                    this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    # save the trained model to disk (currently disabled)
                    '''
                    layer_state = classifier.__getstate__()
                    trained_model_list = [layer_state]
                    trained_model_array = numpy.asarray(trained_model_list)
                    classifier_file = open(train_model_route, 'w')
                    cPickle.dump([1,2,3], classifier_file, protocol=2)
                    numpy.save(classifier_file, trained_model_array)
                    classifier_file.close()
                    '''
                    '''
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    test_res = [test_results(i)
                                   for i in xrange(n_test_batches)]

                    print(('     epoch %i, minibatch %i/%i, test error of best'
                       ' model %f %%') %
                        (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100.))
                     '''

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print >> output_file, (('Optimization complete with best validation score of %f %%, '
           'with test performance %f %% '
           'and best train performance %f %%') %
                 (best_validation_loss * 100., test_score * 100., best_train_loss * 100.))
    print >> output_file, 'The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.1fs' % ((end_time - start_time)))
    output_file.close()
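The example relies on load_data.shared_dataset, which is not shown here. If it mirrors the standard Theano tutorial helper (an assumption about this project), it would look roughly like this:

import numpy
import theano
import theano.tensor as T

def shared_dataset(data_xy, borrow=True):
    """Put (inputs, labels) into shared variables so minibatches can be sliced on the GPU."""
    data_x, data_y = data_xy
    shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
    shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
    # labels are stored as floatX for GPU-friendly storage but used as int32 indices
    return shared_x, T.cast(shared_y, 'int32')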
Ejemplo n.º 20
0
    def __init__(self, rng, input, nkerns, batch_size, image_size, image_dimension):
        # Reshape matrix of rasterized images of shape (batch_size, size[0] * size[1])
        # to a 4D tensor, compatible with our LeNetConvPoolLayer
        self.layer0_input = input.reshape((batch_size, image_dimension, image_size[0], image_size[1]))

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (100-3+1 , 100-3+1) = (98, 98)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 98, 98)
        self.layer0 = LeNetConvPoolLayer(
            rng,
            input=self.layer0_input,
            image_shape=(batch_size, image_dimension, image_size[0], image_size[1]),
            filter_shape=(nkerns[0], image_dimension, 3, 3),
            poolsize=(2, 2),
            pool_flag = False
        )


        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (98-3+1, 98-3+1) = (96, 96)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 96, 96)
        self.layer1 = LeNetConvPoolLayer(
            rng,
            input= self.layer0.output,
            image_shape=(batch_size, nkerns[0], 98, 98),
            filter_shape=(nkerns[1], nkerns[0], 3, 3),
            poolsize=(2, 2),
            pool_flag = False
        )

        # Construct the third convolutional pooling layer
        # filtering reduces the image size to (96-5+1, 96-5+1) = (92, 92)
        # maxpooling reduces this further to (92/2, 92/2) = (46, 46)
        # 4D output tensor is thus of shape (batch_size, nkerns[2], 46, 46)
        self.layer2 = LeNetConvPoolLayer(
            rng,
            input = self.layer1.output,
            image_shape=(batch_size, nkerns[1], 96, 96),
            filter_shape=(nkerns[2], nkerns[1], 5, 5),
            poolsize=(2,2),
            pool_flag = True
        )

        # Construct the fourth convolutional pooling layer
        # filtering reduces the image size to (46-7+1, 46-7+1) = (40, 40)
        # 4D output tensor is thus of shape (batch_size, nkerns[3], 40, 40)
        self.layer3 = LeNetConvPoolLayer(
            rng,
            input=self.layer2.output,
            image_shape=(batch_size, nkerns[2], 46, 46),
            filter_shape=(nkerns[3], nkerns[2], 7, 7),
            poolsize=(2, 2),
            pool_flag=False
        )

        # Construct the fifth convolutional pooling layer
        # filtering reduces the image size to (40-7+1, 40-7+1) = (34, 34)
        # 4D output tensor is thus of shape (batch_size, nkerns[4], 34, 34)
        self.layer4 = LeNetConvPoolLayer(
            rng,
            input=self.layer3.output,
            image_shape=(batch_size, nkerns[3], 40, 40),
            filter_shape=(nkerns[4], nkerns[3], 7, 7),
            poolsize=(2, 2),
            pool_flag=False
        )

        # Construct the sixth convolutional pooling layer
        # filtering reduces the image size to (34-11+1, 34-11+1) = (24, 24)
        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
        # 4D output tensor is thus of shape (batch_size, nkerns[5], 12, 12)
        self.layer5 = LeNetConvPoolLayer(
            rng,
            input=self.layer4.output,
            image_shape=(batch_size, nkerns[4], 34, 34),
            filter_shape=(nkerns[5], nkerns[4], 11, 11),
            poolsize=(2, 2),
            pool_flag=True
        )



        # the HiddenLayer being fully-connected, it operates on 2D matrices of
        # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
        # This will generate a matrix of shape (batch_size, nkerns[5] * 12 * 12),
        # or (66, 20 * 12 * 12) = (66, 2880) with the default values.
        self.layer6_input = self.layer5.output.flatten(2)

        # construct a fully-connected layer with ReLU activation
        self.layer6 = HiddenLayer(
            rng,
            input=self.layer6_input,
            n_in= nkerns[5] * 12 * 12,
            n_out=1500,
            activation= T.nnet.relu
        )

        self.layer7 = HiddenLayer(
            rng,
            input=self.layer6.output,
            n_in=1500,
            n_out=500,
            activation=T.nnet.relu
        )

        # classify the values of the fully-connected layer
        # self.layer4 = LogisticRegression(input=self.layer3.output, n_in=300, n_out=10)

        self.layer8 = LogisticRegression(input = self.layer7.output, n_in=500, n_out=2)

        # create a list of all model parameters to be fit by gradient descent
        self.params = self.layer8.params + self.layer7.params + self.layer6.params + \
                      self.layer5.params + self.layer4.params + self.layer3.params + self.layer2.params + self.layer1.params + self.layer0.params
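The image sizes quoted in the layer comments above (100 -> 98 -> 96 -> 46 -> 40 -> 34 -> 12) follow from 'valid' convolution and 2x2 max-pooling arithmetic; a short sketch that reproduces them:

def after_valid_conv(size, filter_size):
    # a 'valid' convolution shrinks each spatial dimension by filter_size - 1
    return size - filter_size + 1

def after_pool(size, pool_size):
    # non-overlapping max-pooling divides each spatial dimension by the pool size
    return size // pool_size

size = 100
size = after_valid_conv(size, 3)                  # layer0: 98 (pool_flag=False)
size = after_valid_conv(size, 3)                  # layer1: 96 (pool_flag=False)
size = after_pool(after_valid_conv(size, 5), 2)   # layer2: 92 -> 46
size = after_valid_conv(size, 7)                  # layer3: 40
size = after_valid_conv(size, 7)                  # layer4: 34
size = after_pool(after_valid_conv(size, 11), 2)  # layer5: 24 -> 12
print(size)  # 12, matching n_in = nkerns[5] * 12 * 12 for layer6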
Ejemplo n.º 21
0
    def __init__(self, input, batch_size, activation, state=None):

        #layer0_input = x.reshape((batch_size, 3, 13, 13))
        rng = np.random.RandomState(23455)

        img_size = 13
        img_channels = NUM_CHANNELS

        conv_filter_size = 3
        conv_filter_stride = 1  # hard coded
        conv_filter_depth = 16

        ## Not used because it is hard-coded inside LeNetConvPoolLayer
        pool_filter_size = 3
        pool_filter_stride = 2

        conv_pool_output_size = 5  ## 10

        fullyconnected_output_size = 16

        self.input = input

        if state is None:
            conv_pool_layer_state = None
            fully_connected_layer_state = None
            log_regression_layer_state = None
        else:
            conv_pool_layer_state = state[0:2]
            fully_connected_layer_state = state[2:4]
            log_regression_layer_state = state[4:6]

        self.conv_pool_layer = LeNetConvPoolLayer(
            rng,
            input=input,
            image_shape=(batch_size, img_channels, img_size, img_size),
            filter_shape=(conv_filter_depth, img_channels, conv_filter_size,
                          conv_filter_size),
            poolsize=(3, 3),
            activation=activation,
            state=conv_pool_layer_state)

        self.fullyconnected_layer = HiddenLayer(
            rng,
            input=self.conv_pool_layer.output.flatten(2),
            n_in=conv_filter_depth * conv_pool_output_size *
            conv_pool_output_size,
            n_out=fullyconnected_output_size,
            activation=activation,
            state=fully_connected_layer_state)

        self.log_regression_layer = LogisticRegression(
            input=self.fullyconnected_layer.output,
            n_in=fullyconnected_output_size,
            n_out=2,
            state=log_regression_layer_state)

        self.L1 = (abs(self.conv_pool_layer.W).sum() +
                   abs(self.fullyconnected_layer.W).sum() +
                   abs(self.log_regression_layer.W).sum())

        self.params = self.conv_pool_layer.params + self.fullyconnected_layer.params + self.log_regression_layer.params
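The magic number conv_pool_output_size = 5 is consistent with a 3x3 'valid' convolution on the 13x13 input followed by 3x3 pooling at the hard-coded stride of 2; a quick check under that assumption about the pooling arithmetic:

img_size = 13
conv_filter_size = 3
pool_filter_size = 3
pool_filter_stride = 2

after_conv = img_size - conv_filter_size + 1                            # 13 -> 11 ('valid' convolution)
after_pool = (after_conv - pool_filter_size) // pool_filter_stride + 1  # 11 -> 5
print(after_pool)  # 5 == conv_pool_output_size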
Ejemplo n.º 22
0
        image_shape=(batch_size, nkerns[0], 20, 20),
        filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) \
        )

# the TanhLayer being fully-connected, it operates on 2D matrices of
# shape (batch_size,num_pixels) (i.e matrix of rasterized images).
# This will generate a matrix of shape (20,32*4*4) = (20,512)
layer2_input = layer1.output.flatten(2)

# construct a fully-connected sigmoidal layer
layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 8 * 8,
                     n_out=100, activation=T.tanh \
                     )

# classify the values of the fully-connected sigmoidal layer
layer3 = LogisticRegression(input=layer2.output, n_in=100, n_out=3 \
                            )

# definition for theano.function
test_results = theano.function(inputs=[x], \
    outputs= layer3.y_pred)


def load_trained_model():
    global if_load_trained_model
    global train_model_route
    global layer0_input
    global layer0
    global layer1
    global layer2_input
    global layer2
    global layer3
Ejemplo n.º 23
0
def evaluate_lenet5(learning_rate=0.01,
                    n_epochs=4,
                    emb_size=300,
                    batch_size=50,
                    describ_max_len=20,
                    type_size=12,
                    filter_size=[3, 5],
                    maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options
    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_onlyMT_BBN_epoch4.json'
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  #random seed, control the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))
    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)  #minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types(
        word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others(
        word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = load_official_testData_only_MT(
        word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)
    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels=np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    #now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  #batch*12
    other_labels = T.imatrix()  #batch*4

    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(
            0, 2, 1)  #the input format can be adapted into CNN or GRU or LSTM
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  #(batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  #(type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  #(batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(rng,
                                        filter_shape=(hidden_size[0], 1,
                                                      emb_size,
                                                      filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(rng,
                                                filter_shape=(hidden_size[0],
                                                              1, emb_size,
                                                              filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [
        conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2,
        conv_W_context2
    ]
    '''
    multi-CNN
    '''
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out to zero out features coming from UNK/padding
    sent_embeddings = conv_model.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2,
        b=conv_b2
    )  # multiply the mask with conv_out to zero out features coming from UNK/padding
    sent_embeddings2 = conv_model2.maxpool_vec  #(batch_size, hidden_size) # each sentence then has an embedding of length hidden_size
    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [
        U1, W1, b1
    ]  #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias
    # gru_input = common_input.dimshuffle((0,2,1))   #gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)
    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W,
        b=conv_att_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2,
        b=conv_att_b2,
        W_context=conv_W_context2,
        b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l
    '''
    cross-DNN-dataless
    '''
    #first map label emb into hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(
        rng, emb_size, hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng,
                             input=bow_des,
                             n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W,
                             b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  #(type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot(
        des_rep_hidden.T))  #(batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))
    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(
        cosine_scores)  #(batch_size, type_size)
    '''
    dataless top-30 fine grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  #(batch_size*type_size,maxsentlen,describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb
    ],
                                  axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size
    #classification layer: maps the concatenated feature vector to the 12 SF type scores
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng,
        input=acnn_LR_input,
        n_in=acnn_LR_input_size,
        n_out=12,
        W=acnn_U_a,
        b=acnn_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    acnn_score_matrix = T.nnet.sigmoid(
        acnn_layer_LR.before_softmax)  #batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size,
                                                     16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng,
                                             input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params  # put all model parameters together
    cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() +
                               (conv_att_W**2).sum() + (conv_att_W2**2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)
    '''
    testing
    '''

    ensemble_NN_scores = acnn_score_matrix  #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc =  T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0)
    # sum_binarize_conc = T.sum(binarize_conc,axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores  #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)
    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  #(batch, 4, 4)

    #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function([
        sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
        other_labels
    ],
                                     cost_other,
                                     updates=other_updates,
                                     allow_input_downcast=True,
                                     on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(np.arange(n_train_p2_batches) *
                                batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev=0.0
    # max_meanf1_test=0.0
    # max_weightf1_test=0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0
    while epoch < n_epochs:
        epoch = epoch + 1
        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  #for each batch
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]

            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch], label_sent,
                                     label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id +
                                                     batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch], label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size]
            #     other_cost_i+=train_p2_model(
            #                         train_p2_sents[train_p2_id_batch],
            #                         train_p2_masks[train_p2_id_batch],
            #                         train_p2_labels[train_p2_id_batch],
            #                         label_sent,
            #                         label_mask,
            #                         train_p2_other_labels[train_p2_id_batch]
            #                         )
            #after every 20 batches, we test the performance of the model on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), str(
                        other_cost_i /
                        iter), 'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(
                        test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])
                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test  over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
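The "dataless top-30 fine-grained cosine" block above is easier to follow outside the symbolic graph. An equivalent NumPy sketch for a single sentence/description pair (shapes and names are illustrative):

import numpy as np

def topk_fine_grained_cosine(sent_emb, des_emb, k=30):
    """sent_emb: (emb_size, sent_len), des_emb: (emb_size, des_len); columns L2-normalized."""
    sims = sent_emb.T.dot(des_emb)               # cosine between every sentence word and every description word
    top_k = np.sort(sims.reshape(-1))[-k:]       # the k most similar word pairs
    return 1.0 / (1.0 + np.exp(-top_k.mean()))   # sigmoid of their mean, as in the model

sent = np.random.randn(300, 100); sent /= np.linalg.norm(sent, axis=0, keepdims=True)
des = np.random.randn(300, 20);   des /= np.linalg.norm(des, axis=0, keepdims=True)
print(topk_fine_grained_cosine(sent, des))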
Ejemplo n.º 24
0
    def __init__(
        self,
        numpy_rng,
        theano_rng=None,
        n_ins=784,
        hidden_layers_sizes=[500, 500],
        n_outs=10,
        corruption_levels=[0.1, 0.1]
    ):
        """ This class is made to support a variable number of layers.

        :type numpy_rng: numpy.random.RandomState
        :param numpy_rng: numpy random number generator used to draw initial
                    weights

        :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
        :param theano_rng: Theano random generator; if None is given one is
                           generated based on a seed drawn from `numpy_rng`

        :type n_ins: int
        :param n_ins: dimension of the input to the sdA

        :type hidden_layers_sizes: list of ints
        :param hidden_layers_sizes: sizes of the intermediate layers; must contain
                               at least one value

        :type n_outs: int
        :param n_outs: dimension of the output of the network

        :type corruption_levels: list of float
        :param corruption_levels: amount of corruption to use for each
                                  layer
        """

        self.sigmoid_layers = []
        self.dA_layers = []
        self.params = []
        self.n_layers = len(hidden_layers_sizes)

        assert self.n_layers > 0

        if not theano_rng:
            theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))
        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                                 # [int] labels
        # end-snippet-1

        # The SdA is an MLP, for which all weights of intermediate layers
        # are shared with a different denoising autoencoder
        # We will first construct the SdA as a deep multilayer perceptron,
        # and when constructing each sigmoidal layer we also construct a
        # denoising autoencoder that shares weights with that layer
        # During pretraining we will train these autoencoders (which will
        # lead to changing the weights of the MLP as well)
        # During finetuning we will finish training the SdA by doing
        # stochastic gradient descent on the MLP

        # start-snippet-2
        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_ins
            else:
                input_size = hidden_layers_sizes[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the SdA if you are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.sigmoid_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=hidden_layers_sizes[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.sigmoid_layers.append(sigmoid_layer)
            # it's arguably a philosophical question...
            # but we are going to only declare that the parameters of the
            # sigmoid_layers are parameters of the StackedDAA
            # the visible biases in the dA are parameters of those
            # dA, but not the SdA
            self.params.extend(sigmoid_layer.params)

            # Construct a denoising autoencoder that shares weights with this
            # layer
            dA_layer = dA(numpy_rng=numpy_rng,
                          theano_rng=theano_rng,
                          input=layer_input,
                          n_visible=input_size,
                          n_hidden=hidden_layers_sizes[i],
                          W=sigmoid_layer.W,
                          bhid=sigmoid_layer.b)
            self.dA_layers.append(dA_layer)
        # end-snippet-2
        # We now need to add a logistic layer on top of the MLP
        self.logLayer = LogisticRegression(
            input=self.sigmoid_layers[-1].output,
            n_in=hidden_layers_sizes[-1],
            n_out=n_outs
        )

        self.params.extend(self.logLayer.params)
        # construct a function that implements one step of finetuning

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logLayer.errors(self.y)
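Pretraining this stack usually compiles one update function per dA via its get_cost_updates method, as in the deeplearning.net tutorial; a sketch under that assumption (the sda object and the helper name are illustrative):

import theano
import theano.tensor as T

def pretraining_functions(sda, train_set_x, batch_size):
    """One Theano function per dA layer; assumes each dA exposes the tutorial-style
    get_cost_updates(corruption_level, learning_rate)."""
    index = T.lscalar('index')
    corruption_level = T.scalar('corruption')
    learning_rate = T.scalar('lr')
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    fns = []
    for dA_layer in sda.dA_layers:
        cost, updates = dA_layer.get_cost_updates(corruption_level, learning_rate)
        fns.append(theano.function(
            inputs=[index,
                    theano.In(corruption_level, value=0.2),
                    theano.In(learning_rate, value=0.1)],
            outputs=cost,
            updates=updates,
            givens={sda.x: train_set_x[batch_begin:batch_end]}))
    return fns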
Ejemplo n.º 25
0
Archivo: DBN.py Proyecto: wenmm/DECRES
    def __init__(self,
                 rng,
                 n_in=784,
                 n_hidden=[500, 500],
                 n_out=10,
                 lambda_reg=0.001,
                 alpha_reg=0.001):
        """This class is made to support a variable number of layers.
    
        :type rng: numpy.random.RandomState
        :param rng: numpy random number generator used to draw initial
                   weights

        :type n_in: int
        :param n_in: dimension of the input to the DBN
    
        :type n_hidden: list of ints
        :param n_hidden: intermediate layers size, must contain
                               at least one value

        :type n_out: int
        :param n_out: dimension of the output of the network
       
        :type lambda_reg: float
        :param lambda_reg: parameter to control the sparsity of weights by l_1 norm.
         The regularization term is lambda_reg( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg ||W||_1 ).
         Thus, the larger lambda_reg is, the sparser the weights are.
        
        :type alpha_reg: float
        :param alpha_reg: parameter from interval [0,1] to control the smoothness of weights by squared l_2 norm.
         The regularization term is lambda_reg( (1-alpha_reg)/2 * ||W||_2^2 + alpha_reg ||W||_1 ),
         Thus, the smaller alpha_reg is, the smoother the weights are.
        """

        self.hidden_layers = []
        self.rbm_layers = []
        self.params = []
        self.n_layers = len(n_hidden)

        assert self.n_layers > 0

        # allocate symbolic variables for the data
        self.x = T.matrix('x')  # the data, each row is a sample
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        # [int] labels

        for i in xrange(self.n_layers):
            # construct the sigmoidal layer

            # the size of the input is either the number of hidden units of
            # the layer below or the input size if we are on the first layer
            if i == 0:
                input_size = n_in
            else:
                input_size = n_hidden[i - 1]

            # the input to this layer is either the activation of the hidden
            # layer below or the input of the DBN if we are on the first
            # layer
            if i == 0:
                layer_input = self.x
            else:
                layer_input = self.hidden_layers[-1].output

            sigmoid_layer = HiddenLayer(rng=rng,
                                        input=layer_input,
                                        n_in=input_size,
                                        n_out=n_hidden[i],
                                        activation=T.nnet.sigmoid)
            # add the layer to our list of layers
            self.hidden_layers.append(sigmoid_layer)
            # it's arguably a philosophical question, but we only declare the
            # parameters of the sigmoid layers to be parameters of the DBN;
            # the visible biases in the RBMs are parameters of those RBMs,
            # but not of the DBN
            self.params.extend(sigmoid_layer.params)

            # Construct an RBM that shares weights with this layer
            rbm_layer = RBM(numpy_rng=rng,
                            theano_rng=None,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=n_hidden[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
            self.rbm_layers.append(rbm_layer)

        # We now need to add a logistic layer on top of the MLP
        if self.n_layers > 0:
            self.logRegressionLayer = LogisticRegression(
                input=self.hidden_layers[-1].output,
                n_in=n_hidden[-1],
                n_out=n_out)
        else:
            self.logRegressionLayer = LogisticRegression(input=self.x,
                                                         n_in=n_in,
                                                         n_out=n_out)

        self.params.extend(self.logRegressionLayer.params)

        # regularization
        L1s = []
        L2_sqrs = []
        for i in range(self.n_layers):
            L1s.append(abs(self.hidden_layers[i].W).sum())
            L2_sqrs.append((self.hidden_layers[i].W**2).sum())
        L1s.append(abs(self.logRegressionLayer.W).sum())
        L2_sqrs.append((self.logRegressionLayer.W**2).sum())
        self.L1 = T.sum(L1s)
        self.L2_sqr = T.sum(L2_sqrs)

        # compute the cost for second phase of training,
        # defined as the negative log likelihood
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(
            self.y)
        self.cost=self.negative_log_likelihood + \
        lambda_reg * ( (1.0-alpha_reg)*0.5* self.L2_sqr +  alpha_reg*self.L1)
        # compute the gradients with respect to the model parameters
        # symbolic variable that points to the number of errors made on the
        # minibatch given by self.x and self.y
        self.errors = self.logRegressionLayer.errors(self.y)
        self.y_pred = self.logRegressionLayer.y_pred
        self.y_pred_prob = self.logRegressionLayer.y_pred_prob
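Greedy layer-wise pretraining of the RBM stack above is typically done with contrastive divergence. A sketch assuming each RBM exposes the tutorial-style get_cost_updates(lr, persistent, k) method, which is not shown in this listing (the dbn object and helper name are illustrative):

import theano
import theano.tensor as T

def pretrain_rbms(dbn, train_set_x, batch_size, pretraining_epochs=10, lr=0.01, k=1):
    """Compile one CD-k update function per RBM and run it over all minibatches."""
    index = T.lscalar('index')
    n_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    for rbm in dbn.rbm_layers:
        cost, updates = rbm.get_cost_updates(lr=lr, persistent=None, k=k)
        train_rbm = theano.function(
            [index], cost, updates=updates,
            givens={dbn.x: train_set_x[index * batch_size:(index + 1) * batch_size]})
        for epoch in range(pretraining_epochs):
            mean_cost = sum(train_rbm(i) for i in range(n_batches)) / n_batches
            print('RBM pretraining epoch %d, cost %f' % (epoch, mean_cost))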
Ejemplo n.º 26
0
    def __init__(self,
                 rng,
                 batch_size=100,
                 input_size=None,
                 nkerns=[4, 4, 4],
                 receptive_fields=((2, 8), (2, 8), (2, 8)),
                 poolsizes=((1, 8), (1, 8), (1, 4)),
                 full_hidden=[16],
                 n_out=10):
        """
        
        """
        self.x = T.matrix(name='x', dtype=theano.config.floatX
                          )  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
        self.batch_size = theano.shared(
            value=batch_size, name='batch_size')  #T.lscalar('batch_size')

        self.layers = []
        self.params = []
        for i in range(len(nkerns)):
            receptive_field = receptive_fields[i]
            if i == 0:
                featmap_size_after_downsample = input_size
                layeri_input = self.x.reshape(
                    (batch_size, 1, featmap_size_after_downsample[0],
                     featmap_size_after_downsample[1]))
                image_shape = (batch_size, 1, featmap_size_after_downsample[0],
                               featmap_size_after_downsample[1])
                filter_shape = (nkerns[i], 1, receptive_field[0],
                                receptive_field[1])
            else:
                layeri_input = self.layers[i - 1].output
                image_shape = (batch_size, nkerns[i - 1],
                               featmap_size_after_downsample[0],
                               featmap_size_after_downsample[1])
                filter_shape = (nkerns[i], nkerns[i - 1], receptive_field[0],
                                receptive_field[1])

            layeri = LeNetConvPoolLayer(rng=rng,
                                        input=layeri_input,
                                        image_shape=image_shape,
                                        filter_shape=filter_shape,
                                        poolsize=poolsizes[i])
            featmap_size_after_conv = get_featmap_size_after_conv(
                featmap_size_after_downsample, receptive_fields[i])
            featmap_size_after_downsample = get_featmap_size_after_downsample(
                featmap_size_after_conv, poolsizes[i])
            self.layers.append(layeri)
            self.params.extend(layeri.params)

        # fully connected layer
        #print 'going to fully connected layer'
        #layer_full_input = self.layers[-1].output.flatten(2)

        # construct a fully-connected sigmoidal layer
        #layer_full = HiddenLayer(rng=rng, input=layer_full_input,
        #                         n_in=nkerns[-1] * featmap_size_after_downsample[0] * featmap_size_after_downsample[1],
        #                         n_out=full_hidden, activation=T.tanh)
        #self.layers.append(layer_full)
        #self.params.extend(layer_full.params)

        # classify the values of the fully-connected sigmoidal layer
        #print 'going to output layer'
        #self.logRegressionLayer = LogisticRegression(input=self.layers[-1].output, n_in=full_hidden, n_out=n_out)
        #self.params.extend(self.logRegressionLayer.params)

        # multiple fully connected layers
        print 'going to fully connected layers'
        for i in range(len(full_hidden)):
            if i == 0:
                layer_full_i_input = self.layers[-1].output.flatten(
                    2)  # the output of the last conv-pool layer
                n_i_in = nkerns[-1] * featmap_size_after_downsample[
                    0] * featmap_size_after_downsample[1]
                n_i_out = full_hidden[i]
            else:
                layer_full_i_input = layer_full_i_output
                n_i_in = full_hidden[i - 1]
                n_i_out = full_hidden[i]

            layer_full_i = HiddenLayer(rng=rng,
                                       input=layer_full_i_input,
                                       n_in=n_i_in,
                                       n_out=n_i_out,
                                       activation=T.tanh)
            self.layers.append(layer_full_i)
            self.params.extend(layer_full_i.params)
            layer_full_i_output = layer_full_i.output
        #self.output=layer_full_i_output

        # construct an output layer (classes)
        print 'going to output layer'
        self.logRegressionLayer = LogisticRegression(
            input=self.layers[-1].output, n_in=full_hidden[-1], n_out=n_out)
        self.params.extend(self.logRegressionLayer.params)

        # the cost we minimize during training is the NLL of the model
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(
            self.y)
        self.cost = self.negative_log_likelihood
        self.errors = self.logRegressionLayer.errors(self.y)
        self.y_pred = self.logRegressionLayer.y_pred
        self.y_pred_prob = self.logRegressionLayer.y_pred_prob
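The layer loop above tracks the spatial size of the feature maps through get_featmap_size_after_conv and get_featmap_size_after_downsample, which are not shown in this snippet. Assuming 'valid' convolution and non-overlapping pooling, those helpers would behave roughly like the sketch below; the (4, 512) input size is made up purely for illustration.

def get_featmap_size_after_conv(featmap_size, receptive_field):
    # a 'valid' convolution shrinks each spatial dimension by (filter - 1)
    return (featmap_size[0] - receptive_field[0] + 1,
            featmap_size[1] - receptive_field[1] + 1)

def get_featmap_size_after_downsample(featmap_size, poolsize):
    # non-overlapping pooling divides each dimension by the pool size
    return (featmap_size[0] // poolsize[0], featmap_size[1] // poolsize[1])

# e.g. one pass with receptive_field=(2, 8) and poolsize=(1, 8):
size = get_featmap_size_after_conv((4, 512), (2, 8))       # -> (3, 505)
size = get_featmap_size_after_downsample(size, (1, 8))     # -> (3, 63)
print(size)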
Ejemplo n.º 27
0
    def fit(self, X, Y):
        """
        Demonstrate stochastic gradient descent optimization for a multilayer
        perceptron
      
        This is demonstrated on MNIST.
      
        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
        gradient
      
        :type L1_reg: float
        :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)
      
        :type L2_reg: float
        :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)
      
        :type n_epochs: int
        :param n_epochs: maximal number of epochs to run the optimizer
      
        :type dataset: string
        :param dataset: the path of the MNIST dataset file from
                     http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
      
      
       """
        
        
        if self.best_error is None:
            train_set_x, train_set_y = shared_dataset((X[0], Y[0]-1), borrow=True)
            valid_set_x, valid_set_y = shared_dataset((X[1], Y[1]-1), borrow=True)
            n_classes = len(set(Y[0]))
            input_dimension = X[0].shape[1]
            validation_size = Y[1].shape[0]
            train_size = X[0].shape[0]

        else:
            Y = numpy.array(Y)
            train_set_x, train_set_y = shared_dataset((X, Y-1), borrow=True)
            n_classes = len(set(Y))
            input_dimension = X.shape[1]
            train_size = X.shape[0]
        
      
        n_batches = 1
      
        ######################
        # BUILD ACTUAL MODEL #
        ######################
#         print '... building the model'
      
        # allocate symbolic variables for the data
        index = T.lscalar()  # index to a [mini]batch
        self.x = T.matrix('x')  # the data is presented as rasterized images
        self.y = T.ivector('y')  # the labels are presented as 1D vector of
                            # [int] labels
      
        rng = numpy.random.RandomState(1234)
      
        # construct the MLP class
#         classifier = MLP(rng=rng, input=x, n_in=datasets[0].shape[1],
#                          n_hidden=n_hidden, n_out=n_classes)
#         
        # Since we are dealing with a one hidden layer MLP, this will
        # translate into a TanhLayer connected to the LogisticRegression
        # layer; this can be replaced by a SigmoidalLayer, or a layer
        # implementing any other nonlinearity
        self.hiddenLayer = HiddenLayer(rng=rng, input=self.x,
                                       n_in= input_dimension, n_out=self.n_hidden,
                                       activation=T.tanh)

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=self.n_hidden,
            n_out=n_classes)

        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = abs(self.hiddenLayer.W).sum() \
                + abs(self.logRegressionLayer.W).sum()

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \
                    + (self.logRegressionLayer.W ** 2).sum()

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers it
        # is made of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        
        self.probabilities = self.logRegressionLayer.p_y_given_x

        # the cost we minimize during training is the negative log likelihood of
        # the model plus the regularization terms (L1 and L2); cost is expressed
        # here symbolically
        cost = self.negative_log_likelihood(self.y) \
             + self.L1_reg * self.L1 \
             + self.L2_reg * self.L2_sqr
      
        # compiling a Theano function that computes the mistakes that are made
        # by the model on a minibatch
    #     test_model = theano.function(inputs=[index],
    #             outputs=classifier.errors(y),
    #             givens={
    #                 x: test_set_x[index * batch_size:(index + 1) * batch_size],
    #                 y: test_set_y[index * batch_size:(index + 1) * batch_size]})
      
        
        if self.best_error is None:
            self.validate_model = theano.function(inputs=[],
                    outputs=(self.errors(self.y),self.probabilities),
                    givens={
                        self.x: valid_set_x,
                        self.y: valid_set_y})
      
        # compute the gradient of cost with respect to theta (stored in params)
        # the resulting gradients will be stored in a list gparams
        gparams = []
        for param in self.params:
            gparam = T.grad(cost, param)
            gparams.append(gparam)
      
        # specify how to update the parameters of the model as a list of
        # (variable, update expression) pairs
        updates = []
        # given two lists of the same length, A = [a1, a2, a3, a4] and
        # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
        # each element is a pair formed from the two lists:
        #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
        for param, gparam in zip(self.params, gparams):
            updates.append((param, param - self.learning_rate * gparam))
      
        # compiling a Theano function `train_model` that returns the cost, but
        # in the same time updates the parameter of the model based on the rules
        # defined in `updates`
        self.train_model = theano.function(inputs=[], outputs=cost,
                updates=updates,
                givens={
                    self.x: train_set_x,
                    self.y: train_set_y})
      
        ###############
        # TRAIN MODEL #
        ###############
#         print '... training'
      
        # early-stopping parameters
        patience = 10000  # look as this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is
                               # found
        improvement_threshold = 0.995  # a relative improvement of this much is
                                       # considered significant
        validation_frequency = 1  # go through this many minibatches before
                                  # checking the network on the validation set;
                                  # in this case we check every epoch
      
        best_params = None
        best_validation_loss = numpy.inf
        best_iter = 0
        test_score = 0.
        start_time = time.clock()
      
        epoch = 0
        done_looping = False
        
        errors_for_plot = numpy.zeros(self.n_epochs)
        
        
        while (epoch < self.n_epochs) and (not done_looping):
            epoch = epoch + 1
    #         for minibatch_index in xrange(n_train_batches):
    #         minibatch_index = 0
            minibatch_avg_cost = self.train_model()
#             print 'training error: ', minibatch_avg_cost
            if self.best_error is not None and minibatch_avg_cost <= self.best_error:  # this is for the test phase
#                 validation_losses, my_probs = self.validate_model()
                break
            elif self.best_error is None:
                # iteration number
                iter = (epoch - 1) * n_batches
         
                if (iter + 1) % validation_frequency == 0:
                   # compute zero-one loss on validation set
                   validation_losses, my_probs = self.validate_model()
                     
                   this_validation_loss = numpy.mean(validation_losses)
                   errors_for_plot[epoch-1] = this_validation_loss
                   
#                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
#                         (epoch, 1, n_batches,
#                          this_validation_loss * 100.))
#          
                   # if we got the best validation score until now
                   if this_validation_loss < best_validation_loss:
                       #improve patience if loss improvement is good enough
                       if this_validation_loss < best_validation_loss *  \
                              improvement_threshold:
                           patience = max(patience, iter * patience_increase)
         
                       best_validation_loss = this_validation_loss
                       best_iter = iter
         
                       # test it on the test set
        #                     test_losses = [test_model(i) for i
        #                                    in xrange(n_test_batches)]
        #                     test_score = numpy.mean(test_losses)
         
        #                print(('     epoch %i, minibatch %i/%i, test error of '
        #                       'best model %f %%') %
        #                      (epoch, minibatch_index + 1, n_train_batches,
        #                       test_score * 100.))
         
                if patience <= iter:
                       done_looping = True
                       break
          
        end_time = time.clock()

        
#         print(('Optimization complete. Best validation score of %f %% '
#                'obtained at iteration %i') %
#               (best_validation_loss * 100., best_iter + 1))
#          
#         print >> sys.stderr, ('The code for file ' +
#                               os.path.split(__file__)[1] +
#                               ' ran for %.2fm' % ((end_time - start_time) / 60.))
#         
#         plt.plot(numpy.arange(100), errors_for_plot)
#         plt.show()

        return best_validation_loss
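The training loop in fit() relies on the classic patience-based early stopping from the Theano tutorials: patience counts how many validation checks to tolerate, and it is extended whenever the validation loss improves by more than improvement_threshold. The stand-alone sketch below replays that logic on a made-up sequence of validation losses, with no Theano involved.

import numpy as np

patience = 5                    # tolerate at least this many validation checks
patience_increase = 2
improvement_threshold = 0.995   # relative improvement considered significant
best_validation_loss = np.inf

fake_losses = [0.9, 0.7, 0.65, 0.66, 0.64] + [0.64] * 6   # made-up validation curve
for it, loss in enumerate(fake_losses):
    if loss < best_validation_loss:
        if loss < best_validation_loss * improvement_threshold:
            patience = max(patience, it * patience_increase)
        best_validation_loss = loss
    if patience <= it:
        print('stopping at check %d, best loss %.3f' % (it, best_validation_loss))
        break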
Ejemplo n.º 28
0
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):

    model_options = locals().copy()
    print "model options", model_options

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(
        seed)  # random seed, so that the model generates the same results every run
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    "load raw data"
    train_sents, train_sent_masks, train_sent_labels, train_claims, train_claim_mask, train_labels, word2id = load_fever_train(
        sent_len, claim_len, cand_size)
    train_3th_sents, train_3th_sent_masks, train_3th_sent_labels, train_3th_claims, train_3th_claim_mask, train_3th_labels, word2id = load_fever_train_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)
    test_sents, test_sent_masks, test_sent_labels, test_claims, test_claim_mask, test_sent_names, test_ground_names, test_labels, word2id = load_fever_dev(
        sent_len, claim_len, cand_size, word2id)
    test_3th_sents, test_3th_sent_masks, test_3th_sent_labels, test_3th_claims, test_3th_claim_mask, test_3th_labels, word2id = load_fever_dev_NoEnoughInfo(
        sent_len, claim_len, cand_size, word2id)

    train_sents = np.asarray(train_sents, dtype='int32')
    train_3th_sents = np.asarray(train_3th_sents, dtype='int32')
    joint_train_sents = np.concatenate((train_sents, train_3th_sents))
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    joint_test_sents = np.concatenate((test_sents, test_3th_sents))

    train_sent_masks = np.asarray(train_sent_masks, dtype=theano.config.floatX)
    train_3th_sent_masks = np.asarray(train_3th_sent_masks,
                                      dtype=theano.config.floatX)
    joint_train_sent_masks = np.concatenate(
        (train_sent_masks, train_3th_sent_masks))
    test_sent_masks = np.asarray(test_sent_masks, dtype=theano.config.floatX)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks,
                                     dtype=theano.config.floatX)
    joint_test_sent_masks = np.concatenate(
        (test_sent_masks, test_3th_sent_masks))

    train_sent_labels = np.asarray(train_sent_labels, dtype='int32')
    train_3th_sent_labels = np.asarray(train_3th_sent_labels, dtype='int32')
    joint_train_sent_labels = np.concatenate(
        (train_sent_labels, train_3th_sent_labels))
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    joint_test_sent_labels = np.concatenate(
        (test_sent_labels, test_3th_sent_labels))

    train_claims = np.asarray(train_claims, dtype='int32')
    train_3th_claims = np.asarray(train_3th_claims, dtype='int32')
    joint_train_claims = np.concatenate((train_claims, train_3th_claims))
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    joint_test_claims = np.concatenate((test_claims, test_3th_claims))

    train_claim_mask = np.asarray(train_claim_mask, dtype=theano.config.floatX)
    train_3th_claim_mask = np.asarray(train_3th_claim_mask,
                                      dtype=theano.config.floatX)
    joint_train_claim_mask = np.concatenate(
        (train_claim_mask, train_3th_claim_mask))
    test_claim_mask = np.asarray(test_claim_mask, dtype=theano.config.floatX)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask,
                                     dtype=theano.config.floatX)
    joint_test_claim_mask = np.concatenate(
        (test_claim_mask, test_3th_claim_mask))

    train_labels = np.asarray(train_labels, dtype='int32')
    train_3th_labels = np.asarray(train_3th_labels, dtype='int32')
    joint_train_labels = np.concatenate((train_labels, train_3th_labels))
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')
    joint_test_labels = np.concatenate((test_labels, test_3th_labels))

    joint_train_size = len(joint_train_claims)
    joint_test_size = len(joint_test_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    test_3th_size = len(test_3th_claims)
    vocab_size = len(word2id) + 1
    print 'joint_train size: ', joint_train_size, ' joint_test size: ', joint_test_size
    print 'train size: ', train_size, ' test size: ', test_size
    print 'vocab size: ', vocab_size

    rand_values = rng.normal(
        0.0, 0.01,
        (vocab_size, emb_size))  #generate a matrix by Gaussian distribution
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True
    )  #wrap up the python variable "rand_values" into theano variable

    "now, start to build the input form of the model"
    sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  #(batch, cand_size)
    claim_ids = T.imatrix()  #(batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  #(batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  #(batch, cand_size)
    joint_claim_ids = T.imatrix()  #(batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    embed_input_sents = init_embeddings[sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng,
                                      filter_shape=(hidden_size[0], 1,
                                                    emb_size, filter_size[0]))
    att_conv_W, att_conv_b = create_conv_para(rng,
                                              filter_shape=(hidden_size[0], 1,
                                                            emb_size,
                                                            filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))

    NN_para = [conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context]

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features coming from UNK/padding positions are set to zero
    sent_embeddings = conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features coming from UNK/padding positions are set to zero
    claim_embeddings = conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1),
                               cand_size,
                               axis=1)
    '''
    attentive conv
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=
        embed_input_sents,  #batch_size*cand_size, emb_size, sent_len
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=sents_mask.reshape(
            (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2])),
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l  #(batch_size*cand_size, hidden_size)
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r
    # concate_claim_sent = T.concatenate([batch_claim_emb,batch_sent_emb ], axis=2)
    # concate_2_matrix = concate_claim_sent.reshape((batch_size*cand_size, hidden_size[0]*2))
    concate_claim_sent = T.concatenate([
        batch_claim_emb, batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')
    ],
                                       axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    LR_input = T.concatenate([
        concate_2_matrix, attentive_sent_embeddings_l,
        attentive_sent_embeddings_r
    ],
                             axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2
    #scoring layer: maps each feature vector of size LR_input_size to a single relevance score
    U_a = create_ensemble_para(
        rng, 1, LR_input_size)  # weights mapping LR_input_size features to one score
    # LR_b = theano.shared(value=np.zeros((8,),dtype=theano.config.floatX),name='LR_b', borrow=True)  #bias for each target class
    LR_para = [U_a]
    # layer_LR=LogisticRegression(rng, input=LR_input, n_in=LR_input_size, n_out=8, W=U_a, b=LR_b) #basically it is a multiplication between weight matrix and input feature vector
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))  # one sigmoid score per (claim, candidate sentence) pair
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # inter_sent_claim = T.batched_dot(batch_sent_emb, batch_claim_emb) #(batch_size, cand_size, 1)
    # inter_matrix = T.nnet.sigmoid(inter_sent_claim.reshape((batch_size, cand_size)))
    '''
    maybe 1.0-inter_matrix can be rewritten into 1/e^(inter_matrix)
    '''
    # prob_pos = T.where( sents_labels < 1, 1.0-inter_matrix, inter_matrix)
    # loss = -T.mean(T.log(prob_pos))
    #f1 as loss
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall +
                                                       batch_precision)
    loss = -T.mean(T.log(batch_f1))
    # loss = T.nnet.nnet.binary_crossentropy(inter_matrix, sents_labels).mean()
    '''
    training task2, predict 3 labels
    '''
    joint_embed_input_sents = init_embeddings[joint_sents_ids.flatten(
    )].reshape((batch_size * cand_size, sent_len, emb_size)).dimshuffle(
        0, 2, 1
    )  #embed_input(init_embeddings, sents_ids_l)#embeddings[sents_ids_l.flatten()].reshape((batch_size,maxSentLen, emb_size)).dimshuffle(0,2,1) #the input format can be adapted into CNN or GRU or LSTM
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)
    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features coming from UNK/padding positions are set to zero
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec  #(batch_size*cand_size, hidden_size) # each sentence then has an embedding of length hidden_size
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    joint_premise_emb = T.sum(joint_batch_sent_emb *
                              joint_sents_labels.dimshuffle(0, 1, 'x'),
                              axis=1)  #(batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W,
        b=conv_b
    )  # multiply the mask with conv_out so that features coming from UNK/padding positions are set to zero
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec  #(batch_size, hidden_size) # each claim then has an embedding of length hidden_size

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  #(batch, 2*hidden_size)

    "Logistic Regression layer"
    joint_LR_input = joint_premise_hypo_emb  #T.concatenate([attentive_sent_embeddings_l,attentive_sent_embeddings_r,attentive_sent_embeddings_l+attentive_sent_embeddings_r,attentive_sent_embeddings_l*attentive_sent_embeddings_r],axis=1)
    joint_LR_input_size = 2 * hidden_size[0]

    joint_U_a = create_ensemble_para(rng, 3,
                                     joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(value=np.zeros((3, ),
                                              dtype=theano.config.floatX),
                               name='LR_b',
                               borrow=True)  #bias for each target class
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector
    joint_loss = joint_layer_LR.negative_log_likelihood(
        joint_labels
    )  # for the classification task we use the negative log likelihood as loss; the lower the better
    '''
    testing
    '''
    # binarize_prob = T.where( inter_matrix > 0.5, 1, 0)  #(batch_size, cand_size

    masked_inter_matrix = inter_matrix * sents_labels  #(batch, cand_size)
    test_premise_emb = T.sum(batch_sent_emb *
                             masked_inter_matrix.dimshuffle(0, 1, 'x'),
                             axis=1)
    test_premise_hypo_emb = T.concatenate([test_premise_emb, claim_embeddings],
                                          axis=1)
    test_layer_LR = LogisticRegression(
        rng,
        input=test_premise_hypo_emb,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b
    )  #basically it is a multiplication between weight matrix and input feature vector

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    "Use AdaGrad to update parameters"
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_sents_ids, joint_sents_mask, joint_sents_labels, joint_claim_ids,
        joint_claim_mask, joint_labels
    ],
                                  cost,
                                  updates=updates,
                                  allow_input_downcast=True,
                                  on_unused_input='ignore')
    # dev_model = theano.function([sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels], layer_LR.errors(labels), allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function([
        sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
        joint_labels
    ], [
        inter_matrix,
        test_layer_LR.errors(joint_labels), test_layer_LR.y_pred
    ],
                                 allow_input_downcast=True,
                                 on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 50000000000  # look as this many examples regardless
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    joint_n_train_batches = joint_train_size / batch_size
    joint_train_batch_start = list(
        np.arange(joint_n_train_batches) *
        batch_size) + [joint_train_size - batch_size]
    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]

    n_test_batches = test_size / batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]
    n_test_3th_batches = test_3th_size / batch_size
    test_3th_batch_start = list(np.arange(n_test_3th_batches) *
                                batch_size) + [test_3th_size - batch_size]

    max_acc = 0.0
    max_test_f1 = 0.0
    max_acc_full_evi = 0.0

    cost_i = 0.0
    joint_train_indices = range(joint_train_size)
    train_indices = range(train_size)

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(
            joint_train_indices
        )  # shuffle the training set for each new epoch; this usually helps performance but is not guaranteed to
        random.Random(100).shuffle(train_indices)
        iter_accu = 0

        for joint_batch_id in joint_train_batch_start:  #for each batch
            # iter counts how many batches have been run across all epochs
            iter = (epoch - 1) * joint_n_train_batches + iter_accu + 1
            iter_accu += 1
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]
            for i in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    #joint_sents_ids,joint_sents_mask,joint_sents_labels, joint_claim_ids, joint_claim_mask, joint_labels
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            #after every 100 batches, we test the performance of the model on all test data
            # if (epoch==1 and iter%1000==0) or (epoch>=2 and iter%5==0):
            if iter % 100 == 0:
                print 'Epoch ', epoch, 'iter ' + str(
                    iter) + ' average cost: ' + str(cost_i / iter), 'uses ', (
                        time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                f1_sum = 0.0
                error_sum = 0.0
                full_evi = 0
                predictions = []
                for test_batch_id in test_batch_start:  # for each test batch
                    batch_prob, error_i, pred_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_sent_masks[test_batch_id:test_batch_id +
                                        batch_size],
                        test_sent_labels[test_batch_id:test_batch_id +
                                         batch_size],
                        test_claims[test_batch_id:test_batch_id + batch_size],
                        test_claim_mask[test_batch_id:test_batch_id +
                                        batch_size],
                        test_labels[test_batch_id:test_batch_id + batch_size])
                    error_sum += error_i
                    batch_sent_labels = test_sent_labels[
                        test_batch_id:test_batch_id + batch_size]
                    batch_sent_names = test_sent_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_names = test_ground_names[
                        test_batch_id:test_batch_id + batch_size]
                    batch_ground_labels = test_labels[
                        test_batch_id:test_batch_id + batch_size]
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(
                            batch_ground_labels[i])
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        pred_sent_names = []
                        gold_sent_names = batch_ground_names[i]
                        zipped = [(batch_prob[i, k], batch_sent_labels[i][k],
                                   batch_sent_names[i][k])
                                  for k in range(cand_size)]
                        sorted_zip = sorted(zipped,
                                            key=lambda x: x[0],
                                            reverse=True)
                        for j in range(cand_size):
                            triple = sorted_zip[j]
                            if triple[1] == 1.0:
                                '''
                                we should consider a ranking instead of a binary decision;
                                the threshold "triple[0] > 0.5" controls recall and thus influences strict_acc
                                '''
                                if triple[0] > 0.5:
                                    # pred_sent_names.append(batch_sent_names[i][j])
                                    pred_sent_names.append(triple[2])
                                # if len(pred_sent_names) == max_pred_pick:
                                #     break
                        instance_i['predicted_evidence'] = pred_sent_names
                        # print 'pred_sent_names:',pred_sent_names
                        # print 'gold_sent_names:',gold_sent_names
                        new_gold_names = []
                        for gold_name in gold_sent_names:
                            new_gold_names.append([None, None] + gold_name)
                        instance_i['evidence'] = [new_gold_names]
                        predictions.append(instance_i)
                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1
                # test_f1=f1_sum/(len(test_batch_start)*batch_size)

                for test_batch_id in test_3th_batch_start:  # for each test batch
                    _, error_i, pred_i = test_model(
                        test_3th_sents[test_batch_id:test_batch_id +
                                       batch_size],
                        test_3th_sent_masks[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_sent_labels[test_batch_id:test_batch_id +
                                             batch_size],
                        test_3th_claims[test_batch_id:test_batch_id +
                                        batch_size],
                        test_3th_claim_mask[test_batch_id:test_batch_id +
                                            batch_size],
                        test_3th_labels[test_batch_id:test_batch_id +
                                        batch_size])
                    for i in range(batch_size):
                        instance_i = {}
                        instance_i['label'] = pred_id2label.get(2)
                        instance_i['predicted_label'] = pred_id2label.get(
                            pred_i[i])
                        instance_i['predicted_evidence'] = []
                        instance_i['evidence'] = []
                        predictions.append(instance_i)

                strict_score, label_accuracy, precision, recall, f1 = fever_score(
                    predictions)
                print 'strict_score, label_accuracy, precision, recall, f1: ', strict_score, label_accuracy, precision, recall, f1

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()

        #print 'Batch_size: ', update_freq
    end_time = time.time()

    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    return max_acc
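The evidence-selection loss in evaluate_lenet5 is a soft F1: the sigmoid scores in inter_matrix act as soft selections, so recall, precision and F1 stay differentiable and -log(F1) can be minimised directly. The NumPy sketch below mirrors those few Theano lines on a made-up batch of one claim with five candidate sentences.

import numpy as np

sents_labels = np.array([[1, 1, 0, 0, 0]], dtype='float32')            # gold evidence mask
inter_matrix = np.array([[0.9, 0.4, 0.2, 0.1, 0.1]], dtype='float32')  # sigmoid scores

batch_overlap = np.sum(sents_labels * inter_matrix, axis=1)            # soft true positives
batch_recall = batch_overlap / np.sum(sents_labels, axis=1)
batch_precision = batch_overlap / np.sum(inter_matrix, axis=1)
batch_f1 = 2.0 * batch_recall * batch_precision / (batch_recall + batch_precision)
loss = -np.mean(np.log(batch_f1))
print(batch_recall, batch_precision, batch_f1, loss)   # recall 0.65, precision ~0.76, F1 ~0.70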
Ejemplo n.º 29
0
    def __init__(self, rng, input, n_in, n_hidden, n_out):
        """Initialize the parameters for the multilayer perceptron

        :type rng: numpy.random.RandomState
        :param rng: a random number generator used to initialize weights

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
        architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

        :type n_hidden: int
        :param n_hidden: number of hidden units

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
        which the labels lie

        """

        # Since we are dealing with a one hidden layer MLP, this will translate
        # into a HiddenLayer with a tanh activation function connected to the
        # LogisticRegression layer; the activation function can be replaced by
        # sigmoid or any other nonlinear function
        self.hiddenLayer = HiddenLayer(
            rng=rng,
            input=input,
            n_in=n_in,
            n_out=n_hidden,
            activation=T.tanh
        )

        # The logistic regression layer gets as input the hidden units
        # of the hidden layer
        self.logRegressionLayer = LogisticRegression(
            input=self.hiddenLayer.output,
            n_in=n_hidden,
            n_out=n_out
        )
        # end-snippet-2 start-snippet-3
        # L1 norm ; one regularization option is to enforce L1 norm to
        # be small
        self.L1 = (
            abs(self.hiddenLayer.W).sum()
            + abs(self.logRegressionLayer.W).sum()
        )

        # square of L2 norm ; one regularization option is to enforce
        # square of L2 norm to be small
        self.L2_sqr = (
            (self.hiddenLayer.W ** 2).sum()
            + (self.logRegressionLayer.W ** 2).sum()
        )

        # negative log likelihood of the MLP is given by the negative
        # log likelihood of the output of the model, computed in the
        # logistic regression layer
        self.negative_log_likelihood = (
            self.logRegressionLayer.negative_log_likelihood
        )
        # same holds for the function computing the number of errors
        self.errors = self.logRegressionLayer.errors

        # the parameters of the model are the parameters of the two layers it
        # is made of
        self.params = self.hiddenLayer.params + self.logRegressionLayer.params
        # end-snippet-3

        # keep track of model input
        self.input = input
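A short usage sketch for the class above, assuming it is named MLP as in the standard Theano tutorial (and that the HiddenLayer and LogisticRegression classes it depends on are in scope): the L1 and L2_sqr attributes it exposes are meant to be added to the negative log likelihood to form the training cost. The dimensions, regularization weights and learning rate below are made up.

import numpy
import theano.tensor as T

rng = numpy.random.RandomState(1234)
x = T.matrix('x')    # minibatch of inputs
y = T.ivector('y')   # integer labels

classifier = MLP(rng=rng, input=x, n_in=784, n_hidden=500, n_out=10)

L1_reg, L2_reg = 0.00, 0.0001
cost = (classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr)

# gradients and plain SGD updates for all parameters
gparams = [T.grad(cost, param) for param in classifier.params]
updates = [(param, param - 0.01 * gparam)
           for param, gparam in zip(classifier.params, gparams)]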
Ejemplo n.º 30
0
    def __init__(self,
                 rng,
                 model_input,
                 image_shape=(3, 32, 32),
                 filter_shape=(5, 5),
                 poolsize=(2, 2),
                 batch_size=100,
                 nkerns=(20, 50),
                 n_in=400,
                 n_out=10,
                 temperature=1,
                 dropout_ps=[0.0, 0.0, 0.0, 0.0]):
        layer0_input = model_input.reshape((batch_size, ) + image_shape)
        layer0_input_dropout = _dropout_from_layer(rng,
                                                   layer0_input,
                                                   p=dropout_ps[0])

        self.layer0_dropout = DropoutLenetConvPoolLayer(
            rng,
            input=layer0_input_dropout,
            image_shape=(batch_size, ) + image_shape,
            filter_shape=(nkerns[0], ) + (image_shape[0], ) + filter_shape,
            poolsize=poolsize,
            dropout_p=dropout_ps[1])
        self.layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            W=self.layer0_dropout.W * (1 - dropout_ps[0]),
            b=self.layer0_dropout.b,
            image_shape=(batch_size, ) + image_shape,
            filter_shape=(nkerns[0], ) + (image_shape[0], ) + filter_shape,
            poolsize=poolsize,
        )

        self.layer1_dropout = DropoutLenetConvPoolLayer(
            rng,
            input=self.layer0_dropout.output,
            image_shape=(batch_size, ) + (nkerns[0], ) + (14, 14),
            filter_shape=(nkerns[1], nkerns[0]) + (5, 5),
            poolsize=poolsize,
            dropout_p=dropout_ps[2],
        )
        self.layer1 = LeNetConvPoolLayer(
            rng,
            input=self.layer0.output,
            W=self.layer1_dropout.W * (1 - dropout_ps[1]),
            b=self.layer1_dropout.b,
            image_shape=(batch_size, ) + (nkerns[0], ) + (14, 14),
            filter_shape=(nkerns[1], nkerns[0]) + (5, 5),
            poolsize=poolsize,
        )

        self.layer2_dropout = DropoutHiddenLayer(
            rng,
            input=self.layer1_dropout.output.flatten(2),
            n_in=nkerns[1] * 5 * 5,
            n_out=n_in,
            activation=T.tanh,
            dropout_p=dropout_ps[3],
        )
        self.layer2 = HiddenLayer(
            rng,
            input=self.layer1.output.flatten(2),
            W=self.layer2_dropout.W * (1 - dropout_ps[2]),
            b=self.layer2_dropout.b,
            n_in=nkerns[1] * 5 * 5,
            n_out=n_in,
            activation=T.tanh,
        )

        self.logRegressionLayer_dropout = LogisticRegression(
            input=self.layer2_dropout.output,
            n_in=n_in,
            n_out=n_out,
            temperature=temperature)
        self.logRegressionLayer = LogisticRegression(
            input=self.layer2.output,
            W=self.logRegressionLayer_dropout.W * (1 - dropout_ps[3]),
            b=self.logRegressionLayer_dropout.b,
            n_in=n_in,
            n_out=n_out,
            temperature=temperature)

        # self.L1 = (
        #     abs(self.layer1_dropout.W).sum()
        #     + abs(self.layer2_dropout.W).sum()
        #     + abs(self.logRegressionLayer.W_dropout).sum()
        # )
        #
        # self.L2_sqr = (self.layer1.W ** 2).sum() + \
        #               (self.layer2.W ** 2).sum() + \
        #               (self.logRegressionLayer.W ** 2).sum()

        self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood
        self.negative_log_likelihood_dropout = self.logRegressionLayer_dropout.negative_log_likelihood

        self.errors = self.logRegressionLayer.errors
        self.errors_dropout = self.logRegressionLayer_dropout.errors

        self.params = (self.logRegressionLayer_dropout.params +
                       self.layer2_dropout.params +
                       self.layer1_dropout.params + self.layer0_dropout.params)

        self.p_y_given_x = self.logRegressionLayer.p_y_given_x
        self.p_y_given_x_relaxed = self.logRegressionLayer.p_y_given_x_relaxed
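The paired dropout / non-dropout layers in this last example share weights, with the deterministic copy rescaling them by (1 - p), where p is the dropout probability applied to that layer's input during training; this keeps expected pre-activations comparable between the two paths. The NumPy sketch below checks that convention on made-up data and is independent of the classes above.

import numpy as np

rng = np.random.RandomState(0)
p = 0.5                                   # dropout probability on the layer input
x = rng.normal(size=(10000, 20))          # made-up inputs
W = rng.normal(size=(20, 5))              # shared weights

mask = rng.binomial(n=1, p=1 - p, size=x.shape)        # training-time dropout mask
train_mean = (x * mask).dot(W).mean(axis=0)            # mean pre-activation with dropout
test_mean = x.dot(W * (1 - p)).mean(axis=0)            # test-time path with scaled weights

print(np.allclose(train_mean, test_mean, atol=0.1))    # True: equal in expectation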