def evaluate_lenet5(claim):
    """Build the joint FEVER network and print its prediction for one claim.

    The Theano graph has two stages that share one embedding table and one
    convolution:

    1. Evidence identification: every candidate sentence is scored against
       the claim with a sigmoid; scores above 0.5 are kept, the rest zeroed.
    2. Claim classification: the kept evidence (weighted by its score) and
       the claim are fed to a 3-way logistic regression whose classes map to
       ``1 -> SUPPORTS``, ``0 -> REFUTES``, ``2 -> NOT ENOUGH INFO``.

    NOTE(review): no trained parameters are loaded anywhere in this function;
    all weights are created from the seeded ``rng``. Unless the helper
    constructors load weights themselves, the printed prediction comes from an
    untrained model -- confirm before relying on the output.

    :param claim: the claim to verify. It is passed unchanged to
        ``claim_input_2_theano_input``, which defines the expected type.
    :returns: ``None``. The binarised evidence mask, the predicted class id
        and the predicted label are printed to stdout.
    """
    # Hyper-parameters. ``locals()`` below snapshots exactly these names (plus
    # ``claim``), so nothing else may be defined before that call.
    # learning_rate, n_epochs and max_pred_pick only appear in that snapshot.
    learning_rate = 0.02
    n_epochs = 100
    emb_size = 300
    batch_size = 1  # was 50 according to the original inline note
    filter_size = [3]
    sent_len = 40
    claim_len = 40
    cand_size = 10
    hidden_size = [300, 300]
    max_pred_pick = 5
    model_options = locals().copy()
    print("model options", model_options)

    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    # Fixed seed so that the model generates the same results on every run.
    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)
    # ``srng`` is never used afterwards, but building it draws one number from
    # ``rng``; it is kept so that every later initialisation is unchanged.
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    # Vocabulary file: one "<word> <id>" pair per line.
    word2id = {}
    with codecs.open('/home/wyin3/workspace/FEVER/src/word2id.txt', 'r', 'utf-8') as read_word2id:
        for line in read_word2id:
            parts = line.strip().split()
            if not parts:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            word2id[parts[0]] = int(parts[1])
    print('word2id load over, size:', len(word2id))

    # Turn the raw claim into id/mask lists; the fifth return value (the
    # candidate sentence list) is not needed here.
    claim_idlist, claim_masklist, sent_ins_ids, sent_ins_mask, _sent_cand_list = \
        claim_input_2_theano_input(claim, word2id, claim_len, sent_len, cand_size)

    # Wrap everything in a leading batch axis of size 1.
    test_claims = np.asarray([claim_idlist], dtype='int32')
    test_claim_mask = np.asarray([claim_masklist], dtype=theano.config.floatX)
    test_sents = np.asarray([sent_ins_ids], dtype='int32')
    test_sent_masks = np.asarray([sent_ins_mask], dtype=theano.config.floatX)

    vocab_size = len(word2id) + 1
    # Embedding table drawn from a Gaussian; pretrained word2vec
    # initialisation is not performed in this function.
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)

    # Symbolic inputs.
    sents_ids = T.itensor3()   # (batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    claim_ids = T.imatrix()    # (batch, claim_len)
    claim_mask = T.fmatrix()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Embedding lookups, laid out as (rows, emb_size, length) for the conv layers.
    embed_input_sents = init_embeddings[sents_ids.flatten()].reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_filter_shape = (hidden_size[0], 1, emb_size, filter_size[0])
    context_filter_shape = (hidden_size[0], 1, emb_size, 1)

    # Parameter creation order is kept as-is because each call consumes ``rng``.
    # shared parameters
    conv_W, conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    # task 1 parameters
    task1_att_conv_W, task1_att_conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    task1_conv_W_context, task1_conv_b_context = create_conv_para(rng, filter_shape=context_filter_shape)
    # task 2 parameters
    att_conv_W, att_conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    conv_W_context, conv_b_context = create_conv_para(rng, filter_shape=context_filter_shape)

    # NOTE(review): the two context-conv biases (task1_conv_b_context,
    # conv_b_context) are not part of this list; the order is left untouched
    # in case a saved parameter file depends on it -- confirm.
    NN_para = [conv_W, conv_b, task1_att_conv_W, task1_att_conv_b,
               att_conv_W, att_conv_b, task1_conv_W_context, conv_W_context]

    # Sentence masks flattened to one row per candidate sentence.
    sents_mask_2d = sents_mask.reshape(
        (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2]))
    # Claim repeated once per candidate so it lines up with the sentences.
    claim_input_per_cand = T.repeat(embed_input_claim, cand_size, axis=0)
    claim_mask_per_cand = T.repeat(claim_mask, cand_size, axis=0)

    # Plain masked CNN over sentences and claim (shared weights).
    conv_model_sents = Conv_with_Mask(
        rng, input_tensor3=embed_input_sents,
        mask_matrix=sents_mask_2d,
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        filter_shape=conv_filter_shape, W=conv_W, b=conv_b)
    sent_embeddings = conv_model_sents.maxpool_vec  # (batch_size*cand_size, hidden_size)
    batch_sent_emb = sent_embeddings.reshape((batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng, input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=(batch_size, 1, emb_size, claim_len),
        filter_shape=conv_filter_shape, W=conv_W, b=conv_b)
    claim_embeddings = conv_model_claims.maxpool_vec  # (batch_size, hidden_size)
    batch_claim_emb = T.repeat(claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1)

    # Attentive convolution for task 1 (sentence vs. claim).
    task1_attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_sents,  # (batch_size*cand_size, emb_size, sent_len)
        input_tensor3_r=claim_input_per_cand,
        mask_matrix=sents_mask_2d,
        mask_matrix_r=claim_mask_per_cand,
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=conv_filter_shape,
        filter_shape_context=context_filter_shape,
        W=task1_att_conv_W, b=task1_att_conv_b,
        W_context=task1_conv_W_context, b_context=task1_conv_b_context)
    task1_attentive_sent_embeddings_l = task1_attentive_conv_layer.attentive_maxpool_vec_l
    task1_attentive_sent_embeddings_r = task1_attentive_conv_layer.attentive_maxpool_vec_r

    # [claim ; sentence ; claim . sentence] for every candidate.
    concate_claim_sent = T.concatenate(
        [batch_claim_emb,
         batch_sent_emb,
         T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x')],
        axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))

    # To score each evidence sentence we use the output of the attentive
    # convolution as well as the output of the standard CNN.
    LR_input = T.concatenate(
        [concate_2_matrix,
         task1_attentive_sent_embeddings_l,
         task1_attentive_sent_embeddings_r],
        axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    # One sigmoid score per candidate sentence.
    U_a = create_ensemble_para(rng, 1, LR_input_size)
    LR_para = [U_a]
    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # Keep the score of candidates above 0.5, zero out the others.
    binarize_prob = T.where(inter_matrix > 0.5, 1, 0)  # (batch_size, cand_size)
    sents_labels = inter_matrix * binarize_prob
    sents_weights = sents_labels.dimshuffle(0, 1, 'x')

    # Task 2: predict one of the three claim labels.
    premise_emb = T.sum(batch_sent_emb * sents_weights, axis=1)
    premise_hypo_emb = T.concatenate([premise_emb, claim_embeddings], axis=1)  # (batch, 2*hidden_size)

    # Self-attention over all words of all candidate sentences; the resulting
    # context is added to the word embeddings before the attentive conv.
    sents_tensor3 = embed_input_sents.dimshuffle(0, 2, 1).reshape(
        (batch_size, cand_size * sent_len, emb_size))
    sents_dot = T.batched_dot(sents_tensor3, sents_tensor3.dimshuffle(0, 2, 1))
    sents_dot_2_matrix = T.nnet.softmax(
        sents_dot.reshape((batch_size * cand_size * sent_len, cand_size * sent_len)))
    sents_context = T.batched_dot(
        sents_dot_2_matrix.reshape((batch_size, cand_size * sent_len, cand_size * sent_len)),
        sents_tensor3)  # (batch_size, cand_size*sent_len, emb_size)
    add_sents_context = embed_input_sents + sents_context.reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)

    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=add_sents_context,
        input_tensor3_r=claim_input_per_cand,
        mask_matrix=sents_mask_2d,
        mask_matrix_r=claim_mask_per_cand,
        image_shape=(batch_size * cand_size, 1, emb_size, sent_len),
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=conv_filter_shape,
        filter_shape_context=context_filter_shape,
        W=att_conv_W, b=att_conv_b,
        W_context=conv_W_context, b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l.reshape(
        (batch_size, cand_size, hidden_size[0]))
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r.reshape(
        (batch_size, cand_size, hidden_size[0]))
    masked_sents_attconv = attentive_sent_embeddings_l * sents_weights
    masked_claim_attconv = attentive_sent_embeddings_r * sents_weights
    fine_max = T.concatenate(
        [T.max(masked_sents_attconv, axis=1), T.max(masked_claim_attconv, axis=1)],
        axis=1)  # (batch, 2*hidden)

    # Logistic regression layer over [premise ; claim ; fine-grained max].
    joint_LR_input = T.concatenate([premise_hypo_emb, fine_max], axis=1)
    joint_LR_input_size = 2 * hidden_size[0] + 2 * hidden_size[0]
    joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size)  # (input_size, 3)
    joint_LR_b = theano.shared(
        value=np.zeros((3,), dtype=theano.config.floatX), name='LR_b', borrow=True)
    joint_LR_para = [joint_U_a, joint_LR_b]
    joint_layer_LR = LogisticRegression(
        rng, input=joint_LR_input, n_in=joint_LR_input_size, n_out=3,
        W=joint_U_a, b=joint_LR_b)

    # Full parameter list. It is not consumed in this function; it is kept
    # because it fixes the parameter order a model loader would need.
    params = [init_embeddings] + NN_para + LR_para + joint_LR_para

    test_model = theano.function(
        [sents_ids, sents_mask, claim_ids, claim_mask],
        [binarize_prob, joint_layer_LR.y_pred],
        allow_input_downcast=True, on_unused_input='ignore')

    # The banner says "training", but only a single forward pass is executed.
    print('... training')
    batch_binary_vec, pred_i = test_model(
        test_sents, test_sent_masks, test_claims, test_claim_mask)
    print(batch_binary_vec)
    print(batch_binary_vec[0])
    print(pred_i)
    print(pred_id2label.get(pred_i[0]))
def evaluate_lenet5(learning_rate=0.01, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 10], batch_size=500):
    """Train a LeNet-style network on 48x48 face pictures with early stopping.

    The pictures are loaded with ``Utils_dueo.load_pictures()`` and a pickled
    copy of the data set is written to ``Dataset_unal_48.p`` in the current
    working directory before training starts.

    NOTE(review): ``n_epochs`` and ``batch_size`` are overwritten inside the
    body (2000 and 20) and ``dataset`` is never read, so these three
    arguments currently have no effect. The overrides are kept to preserve
    existing behaviour -- confirm whether they are intentional.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer
                     (currently overridden, see note above)

    :type dataset: string
    :param dataset: unused; kept for backward compatibility

    :type nkerns: list of ints
    :param nkerns: number of kernels on each convolutional layer

    :type batch_size: int
    :param batch_size: minibatch size (currently overridden, see note above)
    """
    rng = numpy.random.RandomState(23455)

    # Function-scope imports, following the original block.
    import pickle
    import timeit
    import Utils_dueo

    # Images for face recognition.
    datasets = Utils_dueo.load_pictures()

    print("Saveing the pickeled data-set")
    # Context manager so the file is flushed and closed even if dump() fails.
    # Original author's warning on this dump: "Attention y is wrong".
    with open("Dataset_unal_48.p", "wb") as dump_file:
        pickle.dump(datasets, dump_file)
    print("Saved the pickeled data-set")

    # Hard-coded settings for the face data set (these shadow the arguments).
    n_out = 6
    batch_size = 20
    n_epochs = 2000

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Number of whole minibatches for training, validation and testing.
    # Floor division keeps the counts integral on Python 2 and Python 3.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # Symbolic variables for the data.
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 48*48)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer.
    layer0_input = x.reshape((batch_size, 1, 48, 48))

    # First convolutional pooling layer:
    # filtering reduces the image size to (48-5+1, 48-5+1) = (44, 44),
    # maxpooling reduces this further to (44/2, 44/2) = (22, 22).
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 48, 48),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Second convolutional pooling layer:
    # filtering reduces the image size to (22-5+1, 22-5+1) = (18, 18),
    # maxpooling reduces this further to (18/2, 18/2) = (9, 9).
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 22, 22),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # The HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, nkerns[1] * 9 * 9).
    layer2_input = layer1.output.flatten(2)

    # Fully-connected layer with tanh activation.
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[1] * 9 * 9,
                         n_out=500, activation=T.tanh)

    # Classify the values of the fully-connected layer.
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # The cost we minimize during training is the NLL of the model.
    cost = layer3.negative_log_likelihood(y)

    # Functions computing the mistakes made by the model on one minibatch.
    test_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # All model parameters to be fit by gradient descent, and their gradients.
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # Plain SGD: one (parameter, updated value) pair per parameter.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # Early-stopping parameters.
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # Go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # time.clock() no longer exists on Python >= 3.8.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                # Two spaces after '=' reproduce the output of the former
                # Python 2 statement ``print 'training @ iter = ', iter``.
                print('training @ iter =  %i' % iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:

                # Zero-one loss on the validation set.
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # If we got the best validation score until now.
                if this_validation_loss < best_validation_loss:

                    # Improve patience if loss improvement is good enough.
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # Save best validation score and iteration number.
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Test it on the test set.
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    # Same text as the former ``print >> sys.stderr`` statement.
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10):
    """Build a DBN as a stack of sigmoid layers with weight-sharing RBMs.

    This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                      weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `numpy_rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the DBN

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value (only read, never
                                modified here)

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :raises AssertionError: if `hidden_layers_sizes` is empty
    """
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2**30))

    # Symbolic variables for the data.
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as 1D vector of
                             # [int] labels

    # The DBN is an MLP whose intermediate layers each share their weights
    # with a different RBM. We construct the MLP layer by layer and, for each
    # sigmoid layer, an RBM that shares that layer's weights. Pretraining
    # trains the RBMs (which also changes the MLP weights); finetuning then
    # runs stochastic gradient descent on the MLP.

    # The first layer reads the DBN input; every later layer reads the
    # output of the layer below it.
    input_size = n_ins
    layer_input = self.x

    for n_hidden in hidden_layers_sizes:
        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=n_hidden,
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)

        # Only the parameters of the sigmoid layers are declared parameters
        # of the DBN. The visible biases of the RBMs belong to those RBMs,
        # not to the DBN.
        self.params.extend(sigmoid_layer.params)

        # Construct an RBM that shares weights with this layer.
        rbm_layer = RBM(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=n_hidden,
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

        # The next layer is fed by the layer that was just built.
        input_size = n_hidden
        layer_input = sigmoid_layer.output

    # Logistic layer on top of the MLP.
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # Cost for the second phase of training: the negative log likelihood of
    # the logistic regression (output) layer.
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

    # Symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y.
    self.errors = self.logLayer.errors(self.y)
def evaluate_lenet5(train_set, test_set, valid_set,
                    learning_rate=0.1, n_epochs=200,
                    nkerns=[20, 50], batch_size=500):
    """Train a LeNet-style network with SGD and early stopping.

    Each data set is an ``(inputs, labels)`` pair whose members support
    ``.shape[0]`` and slicing (this is what ``minibatch_generator`` relies
    on). Only whole minibatches are used; a trailing partial batch is dropped.

    :param dataset train_set: dataset to use for training.
    :param dataset test_set: dataset to use for testing.
    :param dataset valid_set: dataset to use for validation.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :returns: ``None``; progress and the final scores are printed.
    """
    rng = numpy.random.RandomState(23455)

    def minibatch_generator(dataset):
        """Yield successive whole ``(batch_x, batch_y)`` minibatches."""
        dataset_x, dataset_y = dataset
        for i in range(dataset_x.shape[0] // batch_size):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            yield dataset_x[start_idx:end_idx], dataset_y[start_idx:end_idx]

    x = T.matrix('x')
    y = T.lvector('y')

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # NOTE(review): ``x`` is declared as a 2-D matrix but is handed to the
    # first conv layer together with a 4-D ``image_shape``; presumably
    # LeNetConvPoolLayer reshapes its input -- confirm.

    # First convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24),
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12).
    layer0 = LeNetConvPoolLayer(rng, input=x,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Second convolutional pooling layer:
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8),
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4).
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # The HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, nkerns[1] * 4 * 4).
    layer2_input = layer1.output.flatten(2)

    # Fully-connected layer with tanh activation.
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500, activation=T.tanh)

    # Classify the values of the fully-connected layer.
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # The cost we minimize during training is the NLL of the model.
    cost = layer3.negative_log_likelihood(y)

    # Functions computing the mistakes made by the model on one minibatch.
    test_model = theano.function([x, y], layer3.errors(y))
    validate_model = theano.function([x, y], layer3.errors(y))

    # All model parameters to be fit by gradient descent, and their gradients.
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # Plain SGD: one (parameter, updated value) pair per parameter.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function([x, y], cost, updates=updates)

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    # Early-stopping parameters.
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    # Number of minibatches the generator actually yields per epoch. The
    # previous code read the undefined name ``train`` here and rounded up,
    # although the trailing partial batch is never produced.
    n_train_batches = train_set[0].shape[0] // batch_size

    # Go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch. Floor division
    # keeps the frequency an integer.
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # time.clock() no longer exists on Python >= 3.8.
    start_time = time.perf_counter()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index, (batch_x, batch_y) in enumerate(
                minibatch_generator(train_set)):

            # Zero-based global iteration number. The "+ 1" in the checks and
            # messages below turns it into a count of processed minibatches,
            # so validation runs right after the last minibatch of an epoch.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter = %i' % iteration)
            train_model(batch_x, batch_y)

            if (iteration + 1) % validation_frequency == 0:

                # Zero-one loss on the validation set.
                validation_losses = [
                    validate_model(valid_x, valid_y)
                    for valid_x, valid_y in minibatch_generator(valid_set)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # If we got the best validation score until now.
                if this_validation_loss < best_validation_loss:

                    # Improve patience if loss improvement is good enough.
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    # Save best validation score and iteration number.
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Test it on the test set.
                    test_losses = [
                        test_model(test_x, test_y)
                        for test_x, test_y in minibatch_generator(test_set)
                    ]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.perf_counter()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.),
          file=sys.stderr)
def evaluate_mnist_1(learning_rate=0.1, n_epochs=100, nkerns=[4, 6], batch_size=2):
    """Train a small LeNet-style network on samples read from ``out_10_10``.

    The input file is read as pairs of lines: one line of
    whitespace-separated feature values followed by one line holding the
    label of that sample.  The same samples are used as the training,
    validation and test sets, and the whole data set is processed as a single
    minibatch.

    Before training starts, the initial weights of both convolutional layers
    and of the hidden layer are written to the file ``weights``, one value per
    line.  During training, gradients, class probabilities, the
    logistic-regression weights and the cost are printed for every minibatch.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each of the two convolutional layers

    :type batch_size: int
    :param batch_size: ignored; it is overwritten with the number of samples
                       read from the input file
    """
    rng = numpy.random.RandomState(3)

    # Read (features, label) pairs; a context manager guarantees the handle
    # is released even if a line fails to parse.
    xs = []
    ys = []
    with open('out_10_10', 'r') as samples_file:
        while True:
            feature_line = samples_file.readline()
            label_line = samples_file.readline()
            if not feature_line:
                break
            xs.append([float(token) for token in feature_line.split()])
            ys.append(float(label_line))

    print(len(xs))
    print(len(xs[0]))
    print(len(ys))

    test_set_x, test_set_y = shared_dataset([xs, ys])
    valid_set_x, valid_set_y = shared_dataset([xs, ys])
    train_set_x, train_set_y = shared_dataset([xs, ys])

    # The whole data set forms one minibatch.  Floor division keeps the batch
    # counts integral so they can be used as loop bounds.
    batch_size = len(ys)
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    def batch_of(data):
        """Return the symbolic slice of ``data`` for minibatch ``index``."""
        return data[index * batch_size:(index + 1) * batch_size]

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with LeNetConvPoolLayer.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # First convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24),
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12).
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Second convolutional pooling layer:
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8),
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4).
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # The hidden layer is fully connected and operates on 2D matrices of
    # shape (batch_size, nkerns[1] * 4 * 4).
    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[1] * 4 * 4, n_out=20,
                         activation=T.tanh)

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=20, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    prob = layer3.prob_y_given_x(y)

    # Dump the initial weights, one value per line.  The convolution kernels
    # are written in row-major order; the hidden-layer matrix is written
    # column by column (hence the transpose).
    with open('weights', 'w+') as weights_file:
        print("layer 0 weights")
        weights_file.writelines(
            str(value) + '\n' for value in layer0.W.get_value().ravel())
        print("layer 1 weights")
        weights_file.writelines(
            str(value) + '\n' for value in layer1.W.get_value().ravel())
        print("layer 2 weights")
        weights_file.writelines(
            str(value) + '\n' for value in layer2.W.get_value().T.ravel())

    # functions computing the mistakes that are made by the model
    test_model = theano.function(
        [index], layer3.errors(y),
        givens={x: batch_of(test_set_x), y: batch_of(test_set_y)})
    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={x: batch_of(valid_set_x), y: batch_of(valid_set_y)})
    prob_model = theano.function(
        [index], prob,
        givens={x: batch_of(valid_set_x), y: batch_of(valid_set_y)})

    # debugging probes exposing the intermediate layer activations
    conv_model0 = theano.function(
        [index], layer0.output, givens={x: batch_of(valid_set_x)})
    conv_model0_conv = theano.function(
        [index], layer0.conv_out, givens={x: batch_of(valid_set_x)})
    conv_model1 = theano.function(
        [index], layer1.output, givens={x: batch_of(valid_set_x)})
    conv_model1_conv = theano.function(
        [index], layer1.conv_out, givens={x: batch_of(valid_set_x)})
    conv_model2 = theano.function(
        [index], layer2.output, givens={x: batch_of(valid_set_x)})

    # All model parameters to be fit by gradient descent.  The order matters:
    # grads_ij[0] and grads_ij[1] below are the logistic-regression gradients.
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # gradients for all model parameters and for the class probabilities
    grads = T.grad(cost, params)
    val_grads = T.grad(cost, layer3.p_y_given_x)

    grad_model = theano.function(
        [index], grads,
        givens={x: batch_of(train_set_x), y: batch_of(train_set_y)})
    val_grad_model = theano.function(
        [index], val_grads,
        givens={x: batch_of(train_set_x), y: batch_of(train_set_y)})

    # plain SGD: one update rule per (parameter, gradient) pair
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    train_model = theano.function(
        [index], cost, updates=updates,
        givens={x: batch_of(train_set_x), y: batch_of(train_set_y)})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            # Evaluate the probes before the parameters are updated.  The
            # layer outputs are computed but currently not printed.
            val_grads_ij = val_grad_model(minibatch_index)
            grads_ij = grad_model(minibatch_index)
            conv0_ij = conv_model0(minibatch_index)
            conv1_ij = conv_model1(minibatch_index)
            conv2_ij = conv_model2(minibatch_index)
            conv0_conv_ij = conv_model0_conv(minibatch_index)
            conv1_conv_ij = conv_model1_conv(minibatch_index)

            print('training @ iter = ', iteration)
            print("last layer var grads")
            print(val_grads_ij[0])

            probs = prob_model(minibatch_index)
            print("Probs")
            print(probs)

            print("log reg layer grads")
            print(grads_ij[0])
            print(grads_ij[1])
            print("Layer 0 output")
            print("Layer 1 output")
            print("Layer 2 output")

            cost_ij = train_model(minibatch_index)

            print("learning_rate")
            print(learning_rate)
            print("layer 0 weights")
            print("layer 1 weights")
            print("layer 2 weights")
            print("log reg layer weights")
            print(layer3.W.get_value())
            print(layer3.b.get_value())
            print("COST")
            print(cost_ij)

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < (best_validation_loss *
                                                improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of best '
                           'model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.),
          file=sys.stderr)
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset=DataSet, nkerns=[20, 50], batch_size=100):
    """Train a two-stage convolutional network with SGD and early stopping.

    The first 540 columns of every sample are reshaped to ``(2, 27, 10)`` and
    passed through two convolution/pooling layers.  The remaining columns are
    concatenated to the flattened convolution output before the hidden layer,
    whose input size allows for 12 such extra columns.

    Whenever the validation error improves, the test set is evaluated and a
    per-class summary for labels 0, 1 and 2 is printed: the accuracy of each
    class and the full table of (true label, predicted label) counts.

    ``n_out`` (the number of output classes) is not a parameter of this
    function; it is read from the enclosing module scope.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :param dataset: passed unchanged to ``load_data``, which must return the
                    (train, valid, test) pairs of shared variables

    :type nkerns: list of ints
    :param nkerns: number of kernels on each of the two convolutional layers

    :type batch_size: int
    :param batch_size: number of samples per minibatch
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print(type(train_set_x))

    # Number of minibatches for training, validation and testing.  Floor
    # division keeps the counts integral so they can be used as loop bounds.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    # The test set keeps a trailing partial batch (ceiling division).
    # NOTE(review): the graph reshapes its input to a fixed batch_size, so a
    # partial final batch presumably requires the test set size to be a
    # multiple of batch_size -- confirm against load_data.
    n_test_samples = test_set_x.get_value(borrow=True).shape[0]
    n_test_batches = ((n_test_samples // batch_size) +
                      (n_test_samples % batch_size > 0))

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # one flattened sample per row
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels

    def batch_of(data):
        """Return the symbolic slice of ``data`` for minibatch ``index``."""
        return data[index * batch_size:(index + 1) * batch_size]

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # The first 540 columns form a (2, 27, 10) input per sample.
    xinp = x[:, :540]
    layer0_input = xinp.reshape((batch_size, 2, 27, 10))

    # First convolutional pooling layer; the next layer's image_shape
    # (11, 4) is the declared size of its output.
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 2, 27, 10),
                                filter_shape=(nkerns[0], 2, 5, 2),
                                poolsize=(2, 2))

    # Second convolutional pooling layer.
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 11, 4),
                                filter_shape=(nkerns[1], nkerns[0], 5, 2),
                                poolsize=(2, 2))

    # Flatten the convolution output and append the non-convolved columns.
    layer2_input = layer1.output.flatten(2)
    layer2_inputT = T.concatenate([layer2_input, x[:, 540:]], axis=1)

    # fully-connected tanh layer
    layer2 = HiddenLayer(rng, input=layer2_inputT,
                         n_in=(nkerns[1] * 3 * 1) + 12, n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # test_model returns both the batch error and the predicted labels
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={x: batch_of(test_set_x), y: batch_of(test_set_y)})
    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={x: batch_of(valid_set_x), y: batch_of(valid_set_y)})

    # all model parameters to be fit by gradient descent, and their gradients
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # plain SGD: one update rule per (parameter, gradient) pair
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]
    train_model = theano.function(
        [index], cost, updates=updates,
        givens={x: batch_of(train_set_x), y: batch_of(train_set_y)})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000               # look at this many examples regardless
    patience_increase = 2          # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    def class_accuracy(hits, total):
        """Return hits / total, or 0.0 for a class with no test samples."""
        return float(hits) / float(total) if total else 0.0

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                print('training @ iter = ', iteration)
            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < (best_validation_loss *
                                                improvement_threshold):
                        patience = max(patience, iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Evaluate every test batch exactly once and keep both
                    # outputs (error and predictions).
                    test_outputs = [test_model(i)
                                    for i in range(n_test_batches)]
                    test_losses = [output[0] for output in test_outputs]
                    # The empty seed array keeps the result well defined
                    # when there are no test batches.
                    predicted = numpy.concatenate(
                        [numpy.asarray([])] +
                        [output[1] for output in test_outputs])
                    print(predicted.shape)
                    test_score = numpy.mean(test_losses)

                    # Raw test labels, fetched from the shared variable that
                    # feeds test_set_y.  A separate name is used so that the
                    # symbolic variable ``y`` is not clobbered.
                    test_labels = test_set_y.owner.inputs[0].get_value()

                    # true_idx[t]: positions whose true label is t.
                    # confusion[t][p]: samples of class t predicted as p.
                    true_idx = [numpy.nonzero(test_labels == float(label))[0]
                                for label in range(3)]
                    confusion = [
                        [numpy.nonzero(predicted[idx] == label)[0].size
                         for label in range(3)]
                        for idx in true_idx
                    ]

                    acc1 = class_accuracy(confusion[0][0], true_idx[0].size)
                    acc2 = class_accuracy(confusion[1][1], true_idx[1].size)
                    if n_out == 3:
                        acc3 = class_accuracy(confusion[2][2],
                                              true_idx[2].size)
                    else:
                        acc3 = 0

                    print((
                        ' epoch %i, minibatch %i/%i, test error of '
                        'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, I11 = %i, I12 = %i, I13 = %i, I21 = %i, I22 = %i, I23 = %i, I31 = %i, I32 = %i, I33 = %i %%'
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100., acc1 * 100., acc2 * 100.,
                         acc3 * 100,
                         confusion[0][0], confusion[0][1], confusion[0][2],
                         confusion[1][0], confusion[1][1], confusion[1][2],
                         confusion[2][0], confusion[2][1], confusion[2][2]))

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.),
          file=sys.stderr)
def evaluate_lenet5(learning_rateOld=0.2, n_epochs=1200, nkerns=[48, 128, 192, 192], batch_size=100):
    """Train a four-stage convolutional network with dropout and momentum.

    Input rows are reshaped to single-channel 50x50 images and classified
    into 58 classes.  The training data is loaded in two chunks per epoch with
    ``loadTrainDataWithIndex(1)`` and ``loadTrainDataWithIndex(2)``.  After
    every epoch the validation and test errors are printed, the parameters
    are stored with ``saveParams`` and the learning rate is multiplied by
    0.998.  The parameter list is also published through the module-level
    global ``G_params``.

    :type learning_rateOld: float
    :param learning_rateOld: unused; the effective learning rate is a shared
                             variable that starts at 1.0 and decays per epoch

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each of the four convolutional layers
                   (original author's note, translated: "so by default the
                   first layer has 20 kernels and the second layer 50?" --
                   the current defaults are 48, 128, 192 and 192)

    :type batch_size: int
    :param batch_size: number of samples per minibatch
    """
    global G_params

    rng = numpy.random.RandomState(23455)

    learning_rate_decay = 0.998
    initial_learning_rate = 1.0

    # the params for momentum:
    # for epoch in [0, mom_epoch_interval], the momentum increases linearly
    # from mom_start to mom_end.  After mom_epoch_interval it stays at mom_end.
    mom_start = 0.5
    mom_end = 0.99
    mom_epoch_interval = batch_size * 5
    squared_filter_length_limit = 15.0

    valid_set_x, valid_set_y = loadValid()
    test_set_x, test_set_y = loadTest()
    # Initial load of the first training chunk; the chunks are loaded again
    # inside the training loop before they are used.
    data_x, data_y = loadTrainDataWithIndex(1)

    # Number of minibatches for validation and testing.  Floor division keeps
    # the counts integral so they can be used as loop bounds.
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size
    # hard-coded for now: number of minibatches per training chunk
    n_train_batches = 100

    print(n_train_batches)
    print(n_valid_batches)

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    epoch = T.scalar()   # current epoch, drives the momentum schedule
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of [int] labels
    learning_rate = theano.shared(
        numpy.asarray(initial_learning_rate, dtype=theano.config.floatX))

    def batch_of(data):
        """Return the symbolic slice of ``data`` for minibatch ``index``."""
        return data[index * batch_size:(index + 1) * batch_size]

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Reshape matrix of rasterized images of shape (batch_size, 50*50)
    # to a 4D tensor, compatible with LeNetConvPoolLayer.
    layer0_input = x.reshape((batch_size, 1, 50, 50))

    # 5x5 convolution + 2x2 pooling: (50, 50) -> (46, 46) -> (23, 23)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 50, 50),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # 3x3 convolution + 2x2 pooling; the next layer declares a (10, 10) input
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 23, 23),
                                filter_shape=(nkerns[1], nkerns[0], 3, 3),
                                poolsize=(2, 2))

    # 3x3 convolution without pooling: (10, 10) -> (8, 8)
    layer1_3 = LeNetConvPoolLayerNoPooling(rng, input=layer1.output,
                                           image_shape=(batch_size, nkerns[1], 10, 10),
                                           filter_shape=(nkerns[2], nkerns[1], 3, 3))

    # 3x3 convolution + 2x2 pooling: (8, 8) -> (6, 6) -> (3, 3)
    layer1_4 = LeNetConvPoolLayer(rng, input=layer1_3.output,
                                  image_shape=(batch_size, nkerns[2], 8, 8),
                                  filter_shape=(nkerns[3], nkerns[2], 3, 3),
                                  poolsize=(2, 2))

    # The fully-connected layers operate on 2D matrices of shape
    # (batch_size, nkerns[3] * 3 * 3).
    layer2_input = layer1_4.output.flatten(2)

    # Dropout branch: its cost is what train_model reports.
    dropoutlayer2 = DropoutHiddenLayer(rng, input=layer2_input,
                                       n_in=nkerns[3] * 3 * 3, n_out=1920,
                                       activation=ReLU)
    dropoutlayer2_2 = DropoutHiddenLayer(rng, input=dropoutlayer2.output,
                                         n_in=1920, n_out=1920,
                                         activation=ReLU)
    dropoutlayer3 = LogisticRegression(input=dropoutlayer2_2.output,
                                       n_in=1920, n_out=58)
    dropoutcost = dropoutlayer3.negative_log_likelihood(y)

    # Regular branch: reuses the dropout layers' weights scaled by 0.5 and
    # their biases unchanged.
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[3] * 3 * 3, n_out=1920,
                         activation=ReLU,
                         W=dropoutlayer2.W * 0.5, b=dropoutlayer2.b)
    layer2_2 = HiddenLayer(rng, input=layer2.output,
                           n_in=1920, n_out=1920,
                           activation=ReLU,
                           W=dropoutlayer2_2.W * 0.5, b=dropoutlayer2_2.b)
    layer3 = LogisticRegression(input=layer2_2.output, n_in=1920, n_out=58)

    # the cost whose gradients drive the parameter updates
    cost = layer3.negative_log_likelihood(y)

    # functions computing the mistakes that are made by the model
    test_model = theano.function(
        [index], layer3.errors(y),
        givens={x: batch_of(test_set_x), y: batch_of(test_set_y)})
    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={x: batch_of(valid_set_x), y: batch_of(valid_set_y)})

    # all model parameters to be fit by gradient descent
    params = (layer3.params + dropoutlayer2_2.params + dropoutlayer2.params +
              layer1_4.params + layer1_3.params + layer1.params +
              layer0.params)

    # Gradients of the model wrt the parameters.  Use the right cost function
    # here to train with or without dropout.
    gparams = [T.grad(cost, param) for param in params]

    # ... and allocate memory for momentum'd versions of the gradient
    gparams_mom = [
        theano.shared(numpy.zeros(param.get_value(borrow=True).shape,
                                  dtype=theano.config.floatX))
        for param in params
    ]

    # Compute momentum for the current epoch
    mom = ifelse(
        epoch < mom_epoch_interval,
        mom_start * (1.0 - epoch / mom_epoch_interval) +
        mom_end * (epoch / mom_epoch_interval),
        mom_end)

    # Update the step direction using momentum; the learning rate is folded
    # into the momentum term (the rule from Hinton's dropout paper).
    updates = OrderedDict()
    for gparam_mom, gparam in zip(gparams_mom, gparams):
        updates[gparam_mom] = (mom * gparam_mom -
                               (1. - mom) * learning_rate * gparam)

    # ... and take a step along that direction
    for param, gparam_mom in zip(params, gparams_mom):
        # learning_rate is already included in the momentum term
        stepped_param = param + updates[gparam_mom]

        # Only 2D parameters (weight matrices) get their norms constrained.
        # The norms of the COLUMNs are clipped, according to
        # https://github.com/BVLC/caffe/issues/109
        if param.get_value(borrow=True).ndim == 2:
            col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0))
            desired_norms = T.clip(col_norms, 0,
                                   T.sqrt(squared_filter_length_limit))
            scale = desired_norms / (1e-7 + col_norms)
            updates[param] = stepped_param * scale
        else:
            updates[param] = stepped_param

    G_params = params

    decay_learning_rate = theano.function(
        inputs=[], outputs=learning_rate,
        updates={learning_rate: learning_rate * learning_rate_decay})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch_counter = 0
    iteration = 0  # number of minibatch updates performed so far

    while epoch_counter < n_epochs:
        epoch_counter = epoch_counter + 1

        for chunk in range(2):
            data_x, data_y = loadTrainDataWithIndex(chunk + 1)
            shared_x = theano.shared(
                numpy.asarray(data_x, dtype=theano.config.floatX),
                borrow=True)
            shared_y = theano.shared(
                numpy.asarray(data_y, dtype=theano.config.floatX),
                borrow=True)
            # NOTE(review): the training function is recompiled for every
            # chunk because it is bound to freshly created shared variables;
            # reusing one pair of shared variables via set_value() would
            # presumably avoid the recompilation -- confirm before changing.
            train_model = theano.function(
                [epoch, index], dropoutcost, updates=updates,
                givens={
                    x: batch_of(shared_x),
                    y: batch_of(T.cast(shared_y, 'int32'))
                })
            for minibatch_index in range(n_train_batches):
                train_model(epoch_counter, minibatch_index)
                iteration += 1

        # compute zero-one loss on validation set
        validation_losses = [validate_model(i)
                             for i in range(n_valid_batches)]
        this_validation_loss = numpy.mean(validation_losses)
        print('epoch %i, validation error %f %%' %
              (epoch_counter, this_validation_loss * 100.))

        test_losses = [test_model(i) for i in range(n_test_batches)]
        test_score = numpy.mean(test_losses)
        print((' epoch %i, test error of best '
               'model %f %%') %
              (epoch_counter, test_score * 100.))

        saveParams(epoch_counter, params)

        # if we got the best validation score until now
        if this_validation_loss < best_validation_loss:
            best_validation_loss = this_validation_loss
            # Zero-based index of the last minibatch update; the final report
            # prints best_iter + 1.
            best_iter = iteration - 1

        new_learning_rate = decay_learning_rate()
        print("New learning rate:" + str(new_learning_rate))

    end_time = time.clock()
    print('Optimization complete.')
    saveParams(1000, params)
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print('The code for file ' + os.path.split(__file__)[1] +
          ' ran for %.2fm' % ((end_time - start_time) / 60.),
          file=sys.stderr)
def evaluate_lenet5(sigma=0.01, learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """Train a two-stage LeNet with Gaussian noise added before the hidden layer.

    The network is: conv/pool -> conv/pool -> (additive noise) -> tanh hidden
    layer -> logistic regression.  Training uses plain minibatch SGD with
    patience-based early stopping.  Every time the validation error reaches a
    new best, the test error (in percent) is printed to stdout; the total
    run time is written to stderr.  Nothing is returned.

    :type sigma: float
    :param sigma: standard deviation of the zero-mean normal noise added to
                  the flattened output of the second conv/pool layer

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset, forwarded to ``load_data``

    :type nkerns: list of ints
    :param nkerns: number of kernels on each of the two conv layers

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """
    rng = numpy.random.RandomState(930508)

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Number of whole minibatches per split.  Floor division keeps these
    # integers on both Python 2 and Python 3 (they are used as loop bounds).
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    # Symbolic inputs.
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # one flattened image per row
    y = T.ivector('y')   # integer class labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Each row of x is reshaped to a square image, so the side length is the
    # square root of the feature count.  A plain Python int is used here: the
    # previous int8 cast would wrap around for sides larger than 127.
    n_feature = train_set_x.get_value(borrow=True).shape[1]
    matrix_dim = int(numpy.sqrt(n_feature))

    layer0_input = x.reshape((batch_size, 1, matrix_dim, matrix_dim))

    # First conv/pool stage: a 5x5 filter shrinks each side by 4, then 2x2
    # max-pooling halves it.
    layer0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, matrix_dim, matrix_dim),
        filter_shape=(nkerns[0], 1, 5, 5),
        poolsize=(2, 2))

    # Second conv/pool stage operates on the (temp1 x temp1) maps of layer0.
    temp1 = (matrix_dim - 5 + 1) // 2
    layer1 = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], temp1, temp1),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2))

    # Flatten to (batch_size, nkerns[1] * temp2 * temp2) for the dense layer.
    layer2_input = layer1.output.flatten(2)
    temp2 = (temp1 - 5 + 1) // 2
    flat_size = nkerns[1] * temp2 * temp2

    ### This is a good place to add noise ###
    # One noise value per flattened feature; the vector is added to every row
    # of the batch.  Its length must follow nkerns[1] (it used to be
    # hard-coded to 50, which only matched the default configuration).
    srng = RandomStreams(seed=508)
    variation = srng.normal((flat_size, ), 0, sigma)
    layer2_input = layer2_input + variation
    ### end ###

    # Fully-connected tanh layer.
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=flat_size,
                         n_out=500,
                         activation=T.tanh)

    # Number of classes is inferred from the label range of the training set.
    train_labels = train_set_y.eval()
    n_out = max(train_labels) - min(train_labels) + 1
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=n_out)

    # The cost we minimize during training is the NLL of the model.
    cost = layer3.negative_log_likelihood(y)

    # Functions computing the error on one test / validation minibatch.
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # All trainable parameters and their gradients.
    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # Plain SGD: one (param, new_value) pair per parameter.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # Early-stopping parameters.
    patience = 10000  # look at this many minibatches regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # relative improvement considered significant
    # Validate after this many minibatches (once per epoch with the defaults).
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # time.time() rather than time.clock(): the latter no longer exists on
    # Python >= 3.8, and the value is only used for the elapsed-time report.
    start_time = time.time()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            # Zero-based count of minibatches processed so far.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:
                # Mean zero-one loss over the validation set.
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                if this_validation_loss < best_validation_loss:
                    # Extend patience only for a sufficiently large gain.
                    if this_validation_loss < (best_validation_loss *
                                               improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Report the test error of the new best model.
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(test_score * 100.)

            if patience <= iteration:
                done_looping = True
                break

    end_time = time.time()
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def __init__(self, numpy_rng=None, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10,
             corruption_levels=[0.1, 0.1]):
    """Build the stacked sigmoid layers, their paired dA layers and the
    logistic-regression output layer.

    :param numpy_rng: random state passed to every ``HiddenLayer`` and ``dA``;
                      a freshly seeded ``numpy.random.RandomState`` is created
                      when omitted
    :param theano_rng: symbolic random stream passed to every ``dA``; created
                       from ``numpy_rng`` when omitted
    :param n_ins: input dimension of the first layer
    :param hidden_layers_sizes: output size of each hidden layer, in order;
                                must contain at least one entry
    :param n_outs: output dimension of the logistic-regression layer
    :param corruption_levels: stored on the instance; not otherwise used by
                              this constructor
    """
    # Kept on the instance (the original comment says: saved for raw_dump).
    # The two sequences are copied so that instances never alias the shared
    # default-argument lists (or a list the caller later mutates).
    hidden_layers_sizes = list(hidden_layers_sizes)
    self.n_ins = n_ins
    self.hidden_layers_sizes = hidden_layers_sizes
    self.n_outs = n_outs
    self.corruption_levels = list(corruption_levels)

    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if numpy_rng is None:
        numpy_rng = numpy.random.RandomState(numpy.random.randint(2 ** 30))
    if theano_rng is None:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # Symbolic inputs.
    self.x = T.matrix('x')   # one example per row
    self.y = T.ivector('y')  # integer labels

    for i in range(self.n_layers):
        # The first layer reads the raw input; every later layer reads the
        # output of the sigmoid layer built in the previous iteration.
        if i == 0:
            input_size = n_ins
            layer_input = self.x
        else:
            input_size = hidden_layers_sizes[i - 1]
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # The dA is handed the sigmoid layer's W and b, so both layers
        # operate on the same weight and hidden-bias variables.
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=hidden_layers_sizes[i],
                      W=sigmoid_layer.W,
                      bhid=sigmoid_layer.b)
        self.dA_layers.append(dA_layer)

    # Logistic-regression layer on top of the last hidden layer.
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # Fine-tuning cost: negative log likelihood of the labels.
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # Symbolic error measure of the output layer on (self.x, self.y).
    self.errors = self.logLayer.errors(self.y)
def evaluate_lenet5(learning_rate=0.0001, n_epochs=2000, nkerns=[256, 256],
                    batch_size=1, window_width=[4, 4], maxSentLength=64,
                    emb_size=300, hidden_size=200, margin=0.5,
                    L2_weight=0.0006, Div_reg=0.06, update_freq=1,
                    norm_threshold=5.0, max_truncate=40):
    """Train a two-layer GRU sentence-pair model on the WikiQA files and
    report test MAP/MRR.

    Each example is a (left, right) sentence pair.  Both sentences go through
    a shared first GRU layer, an attention step picks three columns of each
    side, a shared second GRU layer encodes those, and a logistic-regression
    layer classifies the concatenated features.  After every
    ``validation_frequency`` minibatches the test MAP/MRR of the network is
    printed, followed by MAP/MRR of a linear SVM and a linear regression
    fitted on the network's feature vectors.  Nothing is returned.

    :param learning_rate: step size handed to the Adam update rule
    :param n_epochs: maximal number of passes over the training pairs
    :param nkerns: hidden sizes of the first and second GRU layer
    :param batch_size: slice length taken from the data per step.
        NOTE(review): the embedding lookup is reshaped to
        ``(maxSentLength, emb_size)``, i.e. one sentence, so values other
        than 1 look unsupported -- confirm before changing.
    :param window_width: only ``window_width[0]`` is read, to derive the
        padded sentence length
    :param maxSentLength: ignored; immediately recomputed as
        ``max_truncate + 2 * (window_width[0] - 1)``
    :param emb_size: word-embedding dimension
    :param hidden_size: not used by this function
    :param margin: not used by this function
    :param L2_weight: weight of the L2 penalty in the cost
    :param Div_reg: weight of the ``Diversify_Reg`` penalty in the cost
    :param update_freq: a parameter update is performed on every
        ``update_freq``-th minibatch; on the others only cost/error are
        accumulated
    :param norm_threshold: not used by this function
    :param max_truncate: forwarded to ``load_wikiQA_corpus``
    """
    maxSentLength = max_truncate + 2 * (window_width[0] - 1)
    # Snapshot of the (adjusted) arguments; must stay ahead of any other
    # local assignment so that it contains the options only.
    model_options = locals().copy()
    print('model options ' + str(model_options))

    rootPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/'
    rng = numpy.random.RandomState(23455)

    # vocab_size covers train, dev and test (per the original author's note).
    datasets, vocab_size = load_wikiQA_corpus(
        rootPath + 'vocab.txt', rootPath + 'WikiQA-train.txt',
        rootPath + 'test_filtered.txt', max_truncate, maxSentLength)

    mtPath = '/mounts/data/proj/wenpeng/Dataset/WikiQACorpus/MT/BLEU_NIST/'
    mt_train, mt_test = load_mts_wikiQA(
        mtPath + 'result_train/concate_2mt_train.txt',
        mtPath + 'result_test/concate_2mt_test.txt')
    wm_train, wm_test = load_wmf_wikiQA(
        rootPath + 'train_word_matching_scores.txt',
        rootPath + 'test_word_matching_scores.txt')

    # Rows alternate left sentence / right sentence, so even rows form the
    # left side of each pair and odd rows the right side.
    (indices_train, trainY, trainLengths, normalized_train_length,
     trainLeftPad, trainRightPad) = datasets[0]
    indices_train_l = indices_train[::2, :]
    indices_train_r = indices_train[1::2, :]
    trainLengths_l = trainLengths[::2]
    trainLengths_r = trainLengths[1::2]
    normalized_train_length_l = normalized_train_length[::2]
    normalized_train_length_r = normalized_train_length[1::2]
    trainLeftPad_l = trainLeftPad[::2]
    trainLeftPad_r = trainLeftPad[1::2]
    trainRightPad_l = trainRightPad[::2]
    trainRightPad_r = trainRightPad[1::2]

    (indices_test, testY, testLengths, normalized_test_length,
     testLeftPad, testRightPad) = datasets[1]
    indices_test_l = indices_test[::2, :]
    indices_test_r = indices_test[1::2, :]
    testLengths_l = testLengths[::2]
    testLengths_r = testLengths[1::2]
    normalized_test_length_l = normalized_test_length[::2]
    normalized_test_length_r = normalized_test_length[1::2]
    testLeftPad_l = testLeftPad[::2]
    testLeftPad_r = testLeftPad[1::2]
    testRightPad_l = testRightPad[::2]
    testRightPad_r = testRightPad[1::2]

    # Floor division keeps the batch counts (and therefore the batch start
    # offsets fed to the integer `index` input) integral on Python 3 as well.
    n_train_batches = indices_train_l.shape[0] // batch_size
    n_test_batches = indices_test_l.shape[0] // batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    # Word indices are stored as floatX shared variables and cast back to
    # int64 symbolically before being used for the embedding lookup.
    indices_train_l = theano.shared(
        numpy.asarray(indices_train_l, dtype=theano.config.floatX),
        borrow=True)
    indices_train_r = theano.shared(
        numpy.asarray(indices_train_r, dtype=theano.config.floatX),
        borrow=True)
    indices_test_l = theano.shared(
        numpy.asarray(indices_test_l, dtype=theano.config.floatX),
        borrow=True)
    indices_test_r = theano.shared(
        numpy.asarray(indices_test_r, dtype=theano.config.floatX),
        borrow=True)
    indices_train_l = T.cast(indices_train_l, 'int64')
    indices_train_r = T.cast(indices_train_r, 'int64')
    indices_test_l = T.cast(indices_test_l, 'int64')
    indices_test_r = T.cast(indices_test_r, 'int64')

    # Embedding table: random init, row 0 zeroed, then overwritten by
    # load_word2vec_to_init from the pre-trained vector file.
    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'vocab_embs_300d.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    error_sum = 0

    # Symbolic inputs.
    index = T.lscalar()
    x_index_l = T.lmatrix('x_index_l')  # word-index matrix, must be integer
    x_index_r = T.lmatrix('x_index_r')
    y = T.lvector('y')
    left_l = T.lscalar()
    right_l = T.lscalar()
    left_r = T.lscalar()
    right_r = T.lscalar()
    length_l = T.lscalar()
    length_r = T.lscalar()
    norm_length_l = T.dscalar()
    norm_length_r = T.dscalar()
    mts = T.dmatrix()
    wmf = T.dmatrix()
    cost_tmp = T.dscalar()  # cost accumulated over skipped updates

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # (emb_size, maxSentLength) embedding matrix of each sentence.
    layer0_l_input = embeddings[x_index_l.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()
    layer0_r_input = embeddings[x_index_r.flatten()].reshape(
        (maxSentLength, emb_size)).transpose()

    # Strip the left/right padding columns before any further processing.
    l_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_l_input[:, left_l:-right_l]),
        'l_input_tensor')
    r_input_tensor = debug_print(
        Matrix_Bit_Shift(layer0_r_input[:, left_r:-right_r]),
        'r_input_tensor')

    # Features from the plain sum of word embeddings.
    addition_l = T.sum(layer0_l_input[:, left_l:-right_l], axis=1)
    addition_r = T.sum(layer0_r_input[:, left_r:-right_r], axis=1)
    cosine_addition = cosine(addition_l, addition_r)
    eucli_addition = 1.0 / (1.0 + EUCLID(addition_l, addition_r))

    # First GRU layer; both sentences share the same parameters.
    U, W, b = create_GRU_para(rng, emb_size, nkerns[0])
    layer0_para = [U, W, b]
    layer0_A1 = GRU_Batch_Tensor_Input(X=l_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U, W=W, b=b,
                                       bptt_truncate=-1)
    layer0_A2 = GRU_Batch_Tensor_Input(X=r_input_tensor,
                                       hidden_dim=nkerns[0],
                                       U=U, W=W, b=b,
                                       bptt_truncate=-1)

    # NOTE: cosine_sent / eucli_sent are built but not part of layer3_input.
    cosine_sent = cosine(layer0_A1.output_sent_rep,
                         layer0_A2.output_sent_rep)
    eucli_sent = 1.0 / (1.0 + EUCLID(layer0_A1.output_sent_rep,
                                     layer0_A2.output_sent_rep))

    # Attention over the extended GRU output matrices.
    attention_matrix = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_matrix, layer0_A2.output_matrix,
        layer0_A1.dim, layer0_A2.dim,
        maxSentLength * (maxSentLength + 1) // 2)

    # Attentive pooling at the original sentence level.
    simi_matrix_sent = compute_simi_feature_matrix_with_matrix(
        layer0_A1.output_sent_hiddenstates,
        layer0_A2.output_sent_hiddenstates,
        length_l, length_r, maxSentLength)
    attention_vec_l = T.nnet.softmax(
        T.max(simi_matrix_sent, axis=1)).transpose()
    ibm_l = layer0_A1.output_sent_hiddenstates.dot(
        attention_vec_l).transpose()
    attention_vec_r = T.nnet.softmax(
        T.max(simi_matrix_sent, axis=0)).transpose()
    ibm_r = layer0_A2.output_sent_hiddenstates.dot(
        attention_vec_r).transpose()
    cosine_ibm = cosine(ibm_l, ibm_r)
    eucli_ibm = 1.0 / (1.0 + EUCLID(ibm_l, ibm_r))

    # Keep the three columns with the highest max-attention on each side,
    # with their indices re-sorted into ascending order.
    l_max_attention = T.max(attention_matrix, axis=1)
    neighborsArgSorted = T.argsort(l_max_attention)
    kNeighborsArg = neighborsArgSorted[-3:]
    ll = T.sort(kNeighborsArg).flatten()

    r_max_attention = T.max(attention_matrix, axis=0)
    neighborsArgSorted_r = T.argsort(r_max_attention)
    kNeighborsArg_r = neighborsArgSorted_r[-3:]
    rr = T.sort(kNeighborsArg_r).flatten()

    l_max_min_attention = debug_print(layer0_A1.output_matrix[:, ll],
                                      'l_max_min_attention')
    r_max_min_attention = debug_print(layer0_A2.output_matrix[:, rr],
                                      'r_max_min_attention')

    # Second GRU layer over the selected columns, again shared by both sides.
    U1, W1, b1 = create_GRU_para(rng, nkerns[0], nkerns[1])
    layer1_para = [U1, W1, b1]
    layer1_A1 = GRU_Matrix_Input(X=l_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1, W=W1, b=b1,
                                 bptt_truncate=-1)
    layer1_A2 = GRU_Matrix_Input(X=r_max_min_attention,
                                 word_dim=nkerns[0],
                                 hidden_dim=nkerns[1],
                                 U=U1, W=W1, b=b1,
                                 bptt_truncate=-1)

    vec_l = debug_print(
        layer1_A1.output_vector_last.reshape((1, nkerns[1])), 'vec_l')
    vec_r = debug_print(
        layer1_A2.output_vector_last.reshape((1, nkerns[1])), 'vec_r')

    uni_cosine = cosine(vec_l, vec_r)
    eucli_1 = 1.0 / (1.0 + EUCLID(vec_l, vec_r))

    len_l = norm_length_l.reshape((1, 1))
    len_r = norm_length_r.reshape((1, 1))

    # Feature row fed to the classifier.  The n_in expression of layer3
    # below mirrors this list group by group.
    layer3_input = T.concatenate(
        [
            vec_l, vec_r, uni_cosine, eucli_1,
            cosine_addition, eucli_addition,
            ibm_l.reshape((1, nkerns[0])), ibm_r.reshape((1, nkerns[0])),
            cosine_ibm, eucli_ibm,
            len_l, len_r,
            wmf
        ],
        axis=1)

    layer3 = LogisticRegression(
        rng,
        input=layer3_input,
        n_in=(2 * nkerns[1] + 2) + 2 + (2 * nkerns[0] + 2) + 2 + 2,
        n_out=2)

    L2_reg = debug_print(
        (layer3.W**2).sum() + (U**2).sum() + (W**2).sum() +
        (U1**2).sum() + (W1**2).sum(), 'L2_reg')

    # Diversity penalty on the classifier weights and on slices 0..2 of
    # every GRU weight tensor, accumulated in the original summation order.
    diversify_reg = Diversify_Reg(layer3.W.T)
    for gate in range(3):
        for weights in (U, W, U1, W1):
            diversify_reg = diversify_reg + Diversify_Reg(weights[gate])

    cost_this = debug_print(layer3.negative_log_likelihood(y), 'cost_this')
    cost = debug_print(
        (cost_this + cost_tmp) / update_freq + L2_weight * L2_reg +
        Div_reg * diversify_reg, 'cost')

    test_model = theano.function(
        [index], [layer3.prop_for_posi, layer3_input, y],
        givens={
            x_index_l: indices_test_l[index:index + batch_size],
            x_index_r: indices_test_r[index:index + batch_size],
            y: testY[index:index + batch_size],
            left_l: testLeftPad_l[index],
            right_l: testRightPad_l[index],
            left_r: testLeftPad_r[index],
            right_r: testRightPad_r[index],
            length_l: testLengths_l[index],
            length_r: testLengths_r[index],
            norm_length_l: normalized_test_length_l[index],
            norm_length_r: normalized_test_length_r[index],
            mts: mt_test[index:index + batch_size],
            wmf: wm_test[index:index + batch_size]
        },
        on_unused_input='ignore')

    params = layer3.params + layer1_para + layer0_para

    def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
        """Return Adam-style update pairs for ``params``.

        In this formulation ``b1`` and ``b2`` weight the NEW gradient and
        squared gradient (``m_t = b1 * g + (1 - b1) * m``), i.e. they play
        the role of ``1 - beta`` in the more common notation.
        """
        updates = []
        grads = T.grad(cost, params)
        i = theano.shared(numpy.float64(0.))  # step counter
        i_t = i + 1.
        fix1 = 1. - (1. - b1)**i_t
        fix2 = 1. - (1. - b2)**i_t
        lr_t = lr * (T.sqrt(fix2) / fix1)  # bias-corrected step size
        for p, g in zip(params, grads):
            m = theano.shared(p.get_value() * 0.)
            v = theano.shared(p.get_value() * 0.)
            m_t = (b1 * g) + ((1. - b1) * m)
            v_t = (b2 * T.sqr(g)) + ((1. - b2) * v)
            g_t = m_t / (T.sqrt(v_t) + e)
            p_t = p - (lr_t * g_t)
            updates.append((m, m_t))
            updates.append((v, v_t))
            updates.append((p, p_t))
        updates.append((i, i_t))
        return updates

    updates = Adam(cost=cost, params=params, lr=learning_rate)

    # Both training-side functions read the same slices of the training data.
    train_givens = {
        x_index_l: indices_train_l[index:index + batch_size],
        x_index_r: indices_train_r[index:index + batch_size],
        y: trainY[index:index + batch_size],
        left_l: trainLeftPad_l[index],
        right_l: trainRightPad_l[index],
        left_r: trainLeftPad_r[index],
        right_r: trainRightPad_r[index],
        length_l: trainLengths_l[index],
        length_r: trainLengths_r[index],
        norm_length_l: normalized_train_length_l[index],
        norm_length_r: normalized_train_length_r[index],
        mts: mt_train[index:index + batch_size],
        wmf: wm_train[index:index + batch_size]
    }

    # Performs a parameter update; takes the cost accumulated so far.
    train_model = theano.function([index, cost_tmp],
                                  cost,
                                  updates=updates,
                                  givens=dict(train_givens),
                                  on_unused_input='ignore')

    # Forward pass only: cost, error, feature row and label of one example.
    train_model_predict = theano.function(
        [index], [cost_this, layer3.errors(y), layer3_input, y],
        givens=dict(train_givens),
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    patience = 500000000000000  # look at this many minibatches regardless
    # Evaluate after this many minibatches (once per epoch in practice).
    validation_frequency = min(n_train_batches, patience // 2)

    # NOTE: these three are never updated below, so the closing summary line
    # always reports their initial values.
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.

    start_time = time.time()
    mid_time = start_time

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        minibatch_index = 0
        # Runtime accumulator fed into the symbolic `cost_tmp` input; kept
        # under its own name so the graph variable is not shadowed.
        accumulated_cost = 0.0
        for batch_start in train_batch_start:
            # Number of minibatches processed so far, including this one.
            iteration = (epoch - 1) * n_train_batches + minibatch_index + 1
            minibatch_index = minibatch_index + 1

            if iteration % update_freq != 0:
                # No update on this step: only accumulate cost and errors.
                cost_ij, error_ij, _, _ = train_model_predict(batch_start)
                accumulated_cost += cost_ij
                error_sum += error_ij
            else:
                cost_average = train_model(batch_start, accumulated_cost)
                error_sum = 0
                accumulated_cost = 0.0  # reset for the next group

            if iteration % n_train_batches == 0:
                print('training @ iter = ' + str(iteration) +
                      ' average cost: ' + str(cost_average) +
                      ' error: ' + str(error_sum) + '/' + str(update_freq) +
                      ' error rate: ' + str(error_sum * 1.0 / update_freq))

            if iteration % validation_frequency == 0:
                # Score every test pair with the network itself.
                test_probs = []
                test_features = []
                for test_start in test_batch_start:
                    prob_i, features_i, _ = test_model(test_start)
                    test_probs.append(prob_i[0][0])
                    test_features.append(features_i[0])

                MAP, MRR = compute_map_mrr(rootPath + 'test_filtered.txt',
                                           test_probs)
                print(
                    ('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test MAP of best '
                     'model %f, MRR %f') %
                    (epoch, minibatch_index, n_train_batches, MAP, MRR))

                # Re-extract the training features and fit two external
                # classifiers on them for comparison.
                train_y = []
                train_features = []
                for train_start in train_batch_start:
                    _, _, features_i, label_i = train_model_predict(
                        train_start)
                    train_y.append(label_i[0])
                    train_features.append(features_i[0])

                clf = svm.SVC(C=1.0, kernel='linear')
                clf.fit(train_features, train_y)
                results_svm = clf.decision_function(test_features)
                MAP_svm, MRR_svm = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_svm)

                lr = LinearRegression().fit(train_features, train_y)
                results_lr = lr.predict(test_features)
                MAP_lr, MRR_lr = compute_map_mrr(
                    rootPath + 'test_filtered.txt', results_lr)
                # Items are joined with single spaces, matching the output
                # of the former comma-separated print statement.
                print(' '.join([
                    '\t\t\t\t\t\t\tSVM, MAP: ', str(MAP_svm),
                    ' MRR: ', str(MRR_svm),
                    ' LR: ', str(MAP_lr),
                    ' MRR: ', str(MRR_lr)
                ]))

            if patience <= iteration:
                done_looping = True
                break

        print(' '.join([
            'Epoch ', str(epoch), 'uses ',
            str((time.time() - mid_time) / 60.0), 'min'
        ]))
        mid_time = time.time()

    end_time = time.time()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
def evaluate_lenet5(learning_rate=0.1, n_epochs=4, L2_weight=0.001, emb_size=70,
                    batch_size=50, filter_size=3, maxSentLen=50, nn='CNN'):
    """Train a sentence-pair classifier on the SNLI data and report accuracy.

    Both sentences are encoded with one shared encoder (CNN, GRU or LSTM),
    combined into a feature vector, passed through two tanh hidden layers and
    classified into 3 classes by a logistic-regression layer.  Parameters are
    updated with AdaGrad.  Every 500 training iterations the dev set is
    evaluated; whenever dev accuracy improves, the test set is evaluated too.

    Parameters
    ----------
    learning_rate : float
        AdaGrad step size.
    n_epochs : int
        Number of passes over the training set.
    L2_weight : float
        Currently unused: the cost is the plain negative log-likelihood.
    emb_size : int
        Word-embedding size; also used as the hidden size of every layer.
    batch_size : int
        Fixed mini-batch size (baked into the reshapes of the graph).
    filter_size : int
        Convolution filter width (only used when ``nn == 'CNN'``).
    maxSentLen : int
        Padded sentence length, forwarded to ``load_SNLI_dataset``.
    nn : str
        Encoder type: ``'CNN'``, ``'GRU'`` or ``'LSTM'``.

    Returns
    -------
    float
        The best test accuracy observed during training.

    Raises
    ------
    ValueError
        If ``nn`` is not one of the supported encoder names.
    """
    hidden_size = emb_size
    model_options = locals().copy()
    print("model options " + str(model_options))

    # Fail early and explicitly instead of hitting a NameError on NN_para later.
    if nn not in ('CNN', 'GRU', 'LSTM'):
        raise ValueError("nn must be 'CNN', 'GRU' or 'LSTM', got %r" % (nn,))

    # fixed seed so that the model generates the same initial parameters
    rng = np.random.RandomState(1234)

    (all_sentences_l, all_masks_l, all_sentences_r, all_masks_r,
     all_labels, word2id) = load_SNLI_dataset(maxlen=maxSentLen)

    def as_ids(split):
        return np.asarray(split, dtype='int32')

    def as_mask(split):
        return np.asarray(split, dtype=theano.config.floatX)

    # index 0 / 1 / 2 of every loaded list is the train / dev / test split
    train_sents_l, dev_sents_l, test_sents_l = [as_ids(all_sentences_l[k]) for k in range(3)]
    train_masks_l, dev_masks_l, test_masks_l = [as_mask(all_masks_l[k]) for k in range(3)]
    train_sents_r, dev_sents_r, test_sents_r = [as_ids(all_sentences_r[k]) for k in range(3)]
    train_masks_r, dev_masks_r, test_masks_r = [as_mask(all_masks_r[k]) for k in range(3)]
    train_labels_store, dev_labels_store, test_labels_store = [as_ids(all_labels[k]) for k in range(3)]

    train_size = len(train_labels_store)
    dev_size = len(dev_labels_store)
    test_size = len(test_labels_store)

    # Randomly initialised embedding table (Gaussian, std 0.01); one extra row
    # on top of the vocabulary size.
    vocab_size = len(word2id) + 1
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))
    embeddings = theano.shared(value=np.array(rand_values, dtype=theano.config.floatX), borrow=True)

    # symbolic inputs of the model
    sents_ids_l = T.imatrix()
    sents_mask_l = T.fmatrix()
    sents_ids_r = T.imatrix()
    sents_mask_r = T.fmatrix()
    labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # (batch_size, maxSentLen, emb_size); adapted per encoder below
    common_input_l = embeddings[sents_ids_l.flatten()].reshape((batch_size, maxSentLen, emb_size))
    common_input_r = embeddings[sents_ids_r.flatten()].reshape((batch_size, maxSentLen, emb_size))

    # Each branch creates the shared encoder parameters once and defines
    # encode(), which is applied to the left and then the right sentence.
    if nn == 'CNN':
        conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size, 1, emb_size, filter_size))
        NN_para = [conv_W, conv_b]
        conv_len = maxSentLen - filter_size + 1

        def encode(common_input, sents_mask):
            conv_input = common_input.dimshuffle((0, 'x', 2, 1))  # (batch_size, 1, emb_size, maxSentLen)
            conv_model = Conv_with_input_para(
                rng, input=conv_input,
                image_shape=(batch_size, 1, emb_size, maxSentLen),
                filter_shape=(hidden_size, 1, emb_size, filter_size),
                W=conv_W, b=conv_b)
            conv_output = conv_model.narrow_conv_out.reshape((batch_size, hidden_size, conv_len))
            mask = T.repeat(
                sents_mask[:, filter_size - 1:].reshape((batch_size, 1, conv_len)),
                hidden_size, axis=1)
            # 0 where mask == 1, -10 where mask == 0: pushes masked positions
            # down so they are unlikely to win the max-pooling below.
            penalty = (1.0 - mask) * (mask - 10)
            return T.max(conv_output + penalty, axis=2)  # (batch_size, hidden_size)

    elif nn == 'GRU':
        # U1 and W1 each hold 3 matrices, b1 is the bias
        U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size)
        NN_para = [U1, W1, b1]

        def encode(common_input, sents_mask):
            gru_input = common_input.dimshuffle((0, 2, 1))  # (batch_size, emb_size, maxSentLen)
            gru_layer = GRU_Batch_Tensor_Input_with_Mask(gru_input, sents_mask, hidden_size, U1, W1, b1)
            return gru_layer.output_sent_rep  # (batch_size, hidden_size)

    else:  # nn == 'LSTM'
        LSTM_para_dict = create_LSTM_para(rng, emb_size, hidden_size)
        NN_para = list(LSTM_para_dict.values())

        def encode(common_input, sents_mask):
            lstm_input = common_input.dimshuffle((0, 2, 1))  # same input format as the GRU
            lstm_layer = LSTM_Batch_Tensor_Input_with_Mask(lstm_input, sents_mask, hidden_size, LSTM_para_dict)
            return lstm_layer.output_sent_rep  # (batch_size, hidden_size)

    sent_embeddings_l = encode(common_input_l, sents_mask_l)
    sent_embeddings_r = encode(common_input_r, sents_mask_r)

    # features: both embeddings, their element-wise product and their cosine
    HL_layer_1_input = T.concatenate(
        [sent_embeddings_l,
         sent_embeddings_r,
         sent_embeddings_l * sent_embeddings_r,
         cosine_matrix1_matrix2_rowwise(sent_embeddings_l, sent_embeddings_r).dimshuffle(0, 'x')],
        axis=1)
    HL_layer_1_input_size = hidden_size * 3 + 1
    HL_layer_1 = HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size,
                             n_out=hidden_size, activation=T.tanh)
    HL_layer_2 = HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size,
                             n_out=hidden_size, activation=T.tanh)

    # 3-way classification layer over the raw features plus both hidden layers
    LR_input_size = HL_layer_1_input_size + 2 * hidden_size
    U_a = create_ensemble_para(rng, 3, LR_input_size)
    LR_b = theano.shared(value=np.zeros((3,), dtype=theano.config.floatX), name='LR_b', borrow=True)
    LR_para = [U_a, LR_b]
    LR_input = T.concatenate([HL_layer_1_input, HL_layer_1.output, HL_layer_2.output], axis=1)
    layer_LR = LogisticRegression(rng, input=T.tanh(LR_input), n_in=LR_input_size, n_out=3, W=U_a, b=LR_b)
    loss = layer_LR.negative_log_likelihood(labels)

    params = [embeddings] + NN_para + LR_para + HL_layer_1.params + HL_layer_2.params
    cost = loss  # no regularisation term is applied
    grads = T.grad(cost, params)

    # AdaGrad: one accumulator of squared gradients per parameter
    accumulator = []
    for para_i in params:
        eps_p = np.zeros_like(para_i.get_value(borrow=True), dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        acc = acc_i + T.sqr(grad_i)
        # 1e-8 avoids a division by zero
        updates.append((param_i, param_i - learning_rate * grad_i / (T.sqrt(acc) + 1e-8)))
        updates.append((acc_i, acc))

    model_inputs = [sents_ids_l, sents_mask_l, sents_ids_r, sents_mask_r, labels]
    train_model = theano.function(model_inputs, cost, updates=updates,
                                  allow_input_downcast=True, on_unused_input='ignore')
    dev_model = theano.function(model_inputs, layer_LR.errors(labels),
                                allow_input_downcast=True, on_unused_input='ignore')
    test_model = theano.function(model_inputs, layer_LR.errors(labels),
                                 allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time

    # Floor division keeps the batch counts integral on Python 2 and 3.  The
    # extra final start offset makes the last (overlapping) batch reach the end
    # of the split, because the graph only accepts full batches.
    n_train_batches = train_size // batch_size
    train_batch_start = list(np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_dev_batches = dev_size // batch_size
    dev_batch_start = list(np.arange(n_dev_batches) * batch_size) + [dev_size - batch_size]
    n_test_batches = test_size // batch_size
    test_batch_start = list(np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    def accuracy(model, sents_l, masks_l, sents_r, masks_r, gold, batch_starts):
        """Return 1 - mean batch error of `model` over the given batches."""
        error_sum = 0.0
        for start in batch_starts:
            stop = start + batch_size
            error_sum += model(sents_l[start:stop], masks_l[start:stop],
                               sents_r[start:stop], masks_r[start:stop],
                               gold[start:stop])
        return 1.0 - error_sum / len(batch_starts)

    max_acc_dev = 0.0
    max_acc_test = 0.0

    # One seeded generator shared by all epochs: the run stays reproducible but
    # every epoch gets a different order.  Re-creating Random(200) per epoch
    # produced the identical permutation each time.
    shuffler = random.Random(200)

    epoch = 0
    while epoch < n_epochs:
        epoch += 1
        train_indices = list(range(train_size))
        shuffler.shuffle(train_indices)
        iter_accu = 0   # batches processed in this epoch
        cost_i = 0.0    # summed cost of this epoch

        for batch_id in train_batch_start:
            # global count of batches run so far, including this one
            iteration = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1

            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_model(
                train_sents_l[train_id_batch],
                train_masks_l[train_id_batch],
                train_sents_r[train_id_batch],
                train_masks_r[train_id_batch],
                train_labels_store[train_id_batch])

            # every 500 iterations: report progress and evaluate on dev
            if iteration % 500 != 0:
                continue

            # cost_i is reset every epoch, so average over this epoch's batches
            # (dividing by the global iteration count under-reported the cost).
            print('Epoch  %s iter %s average cost: %s uses  %s min'
                  % (epoch, iteration, cost_i / iter_accu, (time.time() - past_time) / 60.0))
            past_time = time.time()

            dev_accuracy = accuracy(dev_model, dev_sents_l, dev_masks_l, dev_sents_r,
                                    dev_masks_r, dev_labels_store, dev_batch_start)
            improved = dev_accuracy > max_acc_dev
            if improved:
                max_acc_dev = dev_accuracy
            print('current dev_accuracy: %s \t\t\t\t\tmax max_acc_dev: %s' % (dev_accuracy, max_acc_dev))

            if improved:
                # best dev model so far: evaluate it on the test set
                test_accuracy = accuracy(test_model, test_sents_l, test_masks_l, test_sents_r,
                                         test_masks_r, test_labels_store, test_batch_start)
                if test_accuracy > max_acc_test:
                    max_acc_test = test_accuracy
                print('\t\tcurrent testbacc: %s \t\t\t\t\tmax_acc_test: %s' % (test_accuracy, max_acc_test))

        print('Epoch  %s uses  %s min' % (epoch, (time.time() - mid_time) / 60.0))
        mid_time = time.time()

    end_time = time.time()
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1]
                     + ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')

    return max_acc_test
def trainConvNet(data_xy, inp_dim =10, n_epochs = 3, nkerns=[5, 10], batch_size=500, learning_rate=0.1):
    """Train a two-stage convolutional network and pickle its conv parameters.

    The network is conv+pool -> conv+pool -> tanh hidden layer (300 units) ->
    logistic regression (10 outputs), trained with plain SGD and patience-based
    early stopping.  Progress is appended to ``metrics.txt``; after training,
    the weights and biases of both convolution layers are pickled to
    ``param<inp_dim>.pkl`` (W0, b0, W1, b1, in that order).

    Parameters
    ----------
    data_xy : sequence
        ``(train_x, train_y, test_x, test_y, valid_x, valid_y)``.  The x
        entries must offer ``get_value`` and all entries are sliced inside
        Theano ``givens`` (presumably Theano shared variables).
    inp_dim : int
        Side length of the square input patches.  24 and 30 select larger
        kernels; every other value uses kernel sizes 3 and 2.
    n_epochs : int
        Maximum number of epochs.
    nkerns : sequence of int
        Number of kernels of the first and second convolution layer.
    batch_size : int
        Mini-batch size.
    learning_rate : float
        SGD step size.

    Returns
    -------
    None
    """
    # (first kernel size, second kernel size) per supported patch width
    kernel_dims = {24: (5, 3), 30: (7, 5)}
    kern0_dim, kern1_dim = kernel_dims.get(inp_dim, (3, 2))
    pool0_dim = 2
    pool1_dim = 1

    with open("metrics.txt", "a") as metrics_file:
        metrics_file.write("**********\n")
        metrics_file.write("Learning rate: {0}\n".format(learning_rate))

        train_x, train_y, test_x, test_y, valid_x, valid_y = data_xy

        # floor division: batch counts must stay integers on Python 2 and 3
        n_train_batches = train_x.get_value(borrow=True).shape[0] // batch_size
        n_valid_batches = valid_x.get_value(borrow=True).shape[0] // batch_size
        n_test_batches = test_x.get_value(borrow=True).shape[0] // batch_size

        print('...building the model')

        index = T.lscalar()
        x = T.tensor4('x')
        y = T.ivector('y')

        rng = numpy.random.RandomState(23455)

        layer0_input = x.reshape((batch_size, THREE, inp_dim, inp_dim))

        # NOTE(review): image_shape uses THREE while filter_shape hard-codes 3
        # input channels; presumably THREE == 3 -- confirm.
        layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            image_shape=(batch_size, THREE, inp_dim, inp_dim),
            filter_shape=(nkerns[0], 3, kern0_dim, kern0_dim),
            poolsize=(pool0_dim, pool0_dim)
        )

        # side length after a valid convolution followed by pooling
        inp1_dim = (inp_dim - kern0_dim + 1) // pool0_dim
        layer1 = LeNetConvPoolLayer(
            rng,
            input=layer0.output,
            image_shape=(batch_size, nkerns[0], inp1_dim, inp1_dim),
            filter_shape=(nkerns[1], nkerns[0], kern1_dim, kern1_dim),
            poolsize=(pool1_dim, pool1_dim)
        )

        layer2_input = layer1.output.flatten(2)
        inp2_dim = (inp1_dim - kern1_dim + 1) // pool1_dim
        layer2 = HiddenLayer(
            rng,
            input=layer2_input,
            n_in=nkerns[1] * inp2_dim * inp2_dim,
            n_out=300,
            activation=T.tanh
        )

        layer3 = LogisticRegression(input=layer2.output, n_in=300, n_out=10)

        cost = layer3.negative_log_likelihood(y)

        def batch_givens(data_x, data_y):
            """Map the symbolic inputs to mini-batch `index` of a data split."""
            return {
                x: data_x[index * batch_size: (index + 1) * batch_size],
                y: data_y[index * batch_size: (index + 1) * batch_size],
            }

        test_model = theano.function([index], layer3.errors(y),
                                     givens=batch_givens(test_x, test_y))
        validate_model = theano.function([index], layer3.errors(y),
                                         givens=batch_givens(valid_x, valid_y))

        params = layer3.params + layer2.params + layer1.params + layer0.params
        grads = T.grad(cost, params)

        # plain stochastic gradient descent
        updates = [
            (param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(params, grads)
        ]

        train_model = theano.function([index], cost, updates=updates,
                                      givens=batch_givens(train_x, train_y))

        print('training... ')

        # early-stopping state
        patience = 10000              # run at least this many iterations
        patience_increase = 2         # how much longer to wait after a new best
        improvement_threshold = 0.995  # relative improvement considered significant
        validation_frequency = min(n_train_batches, patience // 2)

        best_validation_loss = numpy.inf
        best_iter = 0
        test_score = 0.
        start_time = timeit.default_timer()

        epoch = 0
        done_looping = False

        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in range(n_train_batches):
                iteration = (epoch - 1) * n_train_batches + minibatch_index

                if iteration % 100 == 0:
                    print('training @ iter =  %s' % iteration)
                cost_ij = train_model(minibatch_index)

                if (iteration + 1) % validation_frequency == 0:
                    validation_losses = [validate_model(i) for i in range(n_valid_batches)]
                    this_validation_loss = numpy.mean(validation_losses)
                    print('epoch %i, minibatch %i/%i, validation error %f %%\n' %
                          (epoch, minibatch_index + 1, n_train_batches,
                           this_validation_loss * 100.))
                    metrics_file.write("Epoch: {0}\n".format(epoch))
                    metrics_file.write("Validation loss: {0}\n".format(this_validation_loss * 100))
                    metrics_file.write("Cost: {0}\n".format(cost_ij))

                    if this_validation_loss < best_validation_loss:
                        # extend patience only for a significant improvement
                        if this_validation_loss < best_validation_loss * improvement_threshold:
                            patience = max(patience, iteration * patience_increase)

                        best_validation_loss = this_validation_loss
                        best_iter = iteration

                        test_losses = [test_model(i) for i in range(n_test_batches)]
                        test_score = numpy.mean(test_losses)
                        print(('     epoch %i, minibatch %i/%i, test error of '
                               'best model %f %%') %
                              (epoch, minibatch_index + 1, n_train_batches,
                               test_score * 100.))

                if patience <= iteration:
                    done_looping = True
                    break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1]
                     + ' ran for %.2fm' % ((end_time - start_time) / 60.) + '\n')

    print('saving params for patch width: %i...' % (inp_dim))
    W0, b0 = layer0.params[0], layer0.params[1]
    W1, b1 = layer1.params[0], layer1.params[1]
    # `with` guarantees the file is closed even if one of the dumps raises
    with open('param' + str(inp_dim) + '.pkl', 'wb') as save_file:
        for shared_param in (W0, b0, W1, b1):
            cPickle.dump(shared_param.get_value(borrow=True), save_file, -1)
def __init__(self, batch_size, kernels, input_dimensions,
             convolution_dimensions, pool_sizes, stride_sizes, layer_pattern,
             relu_pattern, dropout_rate, rng_seed=None, base_learning_rate=0.05,
             momentum=0.8, learning_decay_per_epoch=0.91, l2_norm=0,
             name="default", param_index=0, address='', n_epochs=200,
             batch_normalization_pattern=None, batch_norm_learning_rate=0.1,
             batch_norm_decay_per_epoch=0.95, batchnorm_vals_filename=None,
             batchnorm_slide_percent=0.):
    """Build the network graph, its update rules and its Theano functions.

    batch_size - int - size of each batch
    kernels - int array - number of general units each layer (incl.
        input/output)
    input_dimensions - int array[2] - dimensions of input
    convolution_dimensions - int array[2] array - dimensions of each
        convolution
    pool_sizes - int array[2] array - dimensions of pooling for each
        convolution
    stride_sizes - array, indexed per convolutional layer and then per
        dimension - length of strides (this overrides aspects of pooling
        behavior)
    layer_pattern - ['I','C',...,'C','F',...,'F','O'] - indicates pattern of
        layers: I = input, C = convolutional, F = fully-connected; any other
        entry is treated as the output layer
    relu_pattern - boolean array that describes if convolutional layers
        should be rectified; only stored here
    dropout_rate - float or per-layer sequence - rate of dropout; a single
        float is expanded to one entry per layer
    rng_seed - int - seed for random number generator; None defaults to random
    base_learning_rate - floatX - initial learning rate
    momentum - floatX - amount that learning rate carries over through
        iterations
    learning_decay_per_epoch - floatX - factor for decreasing learning rate
        over epochs
    l2_norm - weight of the L2 penalty added to the cost
    name - string that describes the beginning of the filenames of the
        network pickle
    param_index - integer determined a priori to index the param
        configurations and show it in the filename
    address, n_epochs - stored on the instance only
    batch_normalization_pattern - per-layer flags; a falsy value disables
        batch normalization for every layer
    batch_norm_learning_rate - stored as a shared variable when batch
        normalization is enabled
    batch_norm_decay_per_epoch - accepted but not used by this constructor
    batchnorm_vals_filename - path of (or open binary file with) pickled
        per-layer batch-normalization values produced by a separate script;
        None means no fixed values
    batchnorm_slide_percent - sort of like momentum, but for calculations of
        batch-normalization means and standard deviations
    """
    # ---- basic information and hyperparameters ----
    self.layers = []
    self.uses_batch_normalization = bool(batch_normalization_pattern)
    self.batch_norm_pattern = batch_normalization_pattern
    self.batchnorm_vals_filename = batchnorm_vals_filename
    self.batchnorm_slide_percent = batchnorm_slide_percent
    if not self.uses_batch_normalization:
        self.batch_norm_pattern = [False for _ in relu_pattern]
    self.address = address
    self.kernels = kernels
    self.input_dimensions = input_dimensions
    self.output_size = kernels[-1]
    self.inputs = []
    self.batch_size = batch_size
    self.x = x = T.ftensor4('x')
    self.y = y = T.ivector('y')
    self.rng = np.random.RandomState(rng_seed)
    self.name = name
    self.n_epochs = n_epochs
    self.shapes = [(input_dimensions[0], input_dimensions[1])]
    print("input shape: " + str(self.shapes))
    self.convolution_dimensions = convolution_dimensions
    self.rng_seed = rng_seed
    self.layer_pattern = layer_pattern
    self.current_batch_index = 0
    self.pool_sizes = pool_sizes
    self.stride_sizes = stride_sizes
    self.relu_pattern = relu_pattern

    # if the rate is a single float, each layer gets the same rate
    if isinstance(dropout_rate, float):
        dropout_rate = [dropout_rate for _ in layer_pattern]
    self.dropout_rate = dropout_rate
    self.learning_decay_per_epoch = learning_decay_per_epoch
    self.l2_norm = l2_norm

    self.fetcher = prepare_image_data.fetcher(self.batch_size)

    # ---- indexing information ----
    self.ratios = np.asarray([0.6, 0.2, 0.2])
    self.index = T.lscalar()

    # temporarily hardcoded
    self.n_train_batches = 400
    self.n_valid_batches = 120
    self.n_test_batches = 120
    self.cat_labels = self.fetcher.valid_names
    self.y_dim = len(self.cat_labels)

    self.momentum = theano.shared(np.float32(momentum))
    self.base_learning_rate = np.float32(base_learning_rate)
    self.learning_rate = theano.shared(
        np.float32(base_learning_rate * (1 - momentum)))
    self.momentum_raw = momentum
    self.learning_rate_raw = self.learning_rate.get_value()
    if self.uses_batch_normalization:
        self.batch_norm_learning_rate_raw = batch_norm_learning_rate
        self.batch_norm_learning_rate = theano.shared(
            np.float32(self.batch_norm_learning_rate_raw))

    # ---- zero-filled shared buffers that hold one batch of each split ----
    batch_shape = (self.batch_size, self.kernels[0],
                   input_dimensions[0], input_dimensions[1])

    def empty_x():
        return theano.shared(
            np.zeros(shape=batch_shape, dtype=theano.config.floatX),
            borrow=True)

    def empty_y():
        return theano.shared(
            np.zeros(shape=self.batch_size, dtype=np.int32), borrow=True)

    self.training_x = empty_x()
    self.input = self.x.reshape((self.batch_size, self.kernels[0],
                                 self.shapes[0][0], self.shapes[0][1]))
    self.training_y = empty_y()
    self.testing_x = empty_x()
    self.testing_y = empty_y()
    self.validation_x = empty_x()
    self.validation_y = empty_y()

    # ---- load fixed batch-normalization values if a file was given ----
    if self.batchnorm_vals_filename is not None:
        # NOTE: unpickling can execute arbitrary code -- only load files
        # produced by a trusted source.
        if hasattr(self.batchnorm_vals_filename, 'read'):
            # an already-open file object is still accepted
            self.batchnorm_fixed_values = pickle.load(
                self.batchnorm_vals_filename)
        else:
            # pickle.load needs a file object, not a path
            with open(self.batchnorm_vals_filename, 'rb') as handle:
                self.batchnorm_fixed_values = pickle.load(handle)
    else:
        self.batchnorm_fixed_values = [None for _ in layer_pattern]

    # ---- creation of layers ----
    # Position i of layer_pattern maps to self.layers[i - 1], because the
    # input entry adds to self.inputs but not to self.layers (assumes a
    # single leading "I" -- TODO confirm against callers).
    for i, pattern in enumerate(layer_pattern):
        if pattern == "I":
            self.inputs.append(self.input)
            print('inserted input')
        elif pattern == "C":
            prev_shape = self.shapes[i - 1]
            conv_dims = self.convolution_dimensions[i - 1]
            pool = pool_sizes[i - 1]
            stride = stride_sizes[i - 1]
            self.layers.append(
                NetConvPoolLayer(
                    self.rng,
                    input=self.inputs[i - 1],
                    image_shape=(batch_size, kernels[i - 1],
                                 prev_shape[0], prev_shape[1]),
                    filter_shape=(kernels[i], kernels[i - 1],
                                  conv_dims[0], conv_dims[1]),
                    poolsize=pool,
                    stride=stride,
                    dropout_percent=self.dropout_rate[i],
                    batch_norm=self.batch_norm_pattern[i],
                    batchnorm_slide_percent=self.batchnorm_slide_percent,
                    precalculated_batchnorm_values=self.batchnorm_fixed_values[i - 1]))
            # Output size of this layer; floor division keeps it an integer
            # on Python 3 as well (sizes are assumed to be ints).
            x_new = (prev_shape[0] - conv_dims[0] + 1
                     - (pool[0] - stride[0])) // stride[0]
            y_new = (prev_shape[1] - conv_dims[1] + 1
                     - (pool[1] - stride[1])) // stride[1]
            self.inputs.append(self.layers[i - 1].output)
            self.shapes.append((x_new, y_new))
            print("self.shapes: " + str(self.shapes))
            print('added convolution layer')
        elif pattern == "F":
            if layer_pattern[i - 1] == "C":
                next_input = self.inputs[i - 1].flatten(2)
            else:
                next_input = self.inputs[i - 1]
            self.layers.append(
                HiddenLayer(self.rng,
                            input=next_input,
                            n_in=kernels[i - 1] * self.shapes[i - 1][0] *
                            self.shapes[i - 1][1],
                            n_out=kernels[i],
                            activation=T.tanh,
                            dropout_rate=self.dropout_rate[i]))
            self.inputs.append(self.layers[i - 1].output)
            # the shape is only used to determine dimensions of the next layer
            self.shapes.append((1, 1))
            print('added fully-connected hidden layer, shape=%s' %
                  str(self.shapes[-1]))
        else:
            # anything else is treated as the output layer
            if layer_pattern[i - 1] == "C":
                next_input = self.inputs[i - 1].flatten(2)
            else:
                next_input = self.inputs[i - 1]
            self.layers.append(
                LogisticRegression(input=next_input,
                                   n_in=kernels[i - 1],
                                   n_out=self.output_size,
                                   rng=self.rng,
                                   dropout_rate=self.dropout_rate[i]))
            last_index = i - 1
            print('added logistic layer')

    # ---- cost: negative log-likelihood plus L2 penalty on every layer's W ----
    zero = np.float32(0.)
    self.L2_penalty = theano.shared(np.float32(l2_norm))
    self.params = params = [param for layer in self.layers
                            for param in layer.params]
    self.cost = (
        self.layers[last_index].negative_log_likelihood(self.y)
        + self.L2_penalty * T.sum(
            [T.sum(layer.W * layer.W) for layer in self.layers]))

    # ---- updating functions (incl. momentum) ----
    # update 1 (only used for derivation in update #4)
    self.old_updates = [theano.shared(zero * param_i.get_value())
                        for param_i in params]
    self.current_delta = [
        theano.shared(np.float32(zero * param_i.get_value()))
        for param_i in params]
    self.grads = T.grad(self.cost, params)

    # update 2: new step = learning_rate * gradient + momentum * previous step
    self.current_change_update = [
        (current_delta_i,
         self.learning_rate * grad_i + self.momentum * old_updates_i)
        for current_delta_i, grad_i, old_updates_i in
        zip(self.current_delta, self.grads, self.old_updates)]

    # update 3: apply the step to the parameters
    updates = [(param_i, param_i - current_delta_i)
               for param_i, current_delta_i in zip(params, self.current_delta)]

    # update 4 (derived from update #1): remember the step for the next round
    momentum_updates = [
        (old_updates_i, current_delta_i)
        for old_updates_i, current_delta_i in
        zip(self.old_updates, self.current_delta)]

    # batch-normalization sliding updates, convolution layers only
    batchnorm_sliding_updates = []
    for layer in self.layers:
        if not isinstance(layer, NetConvPoolLayer):
            continue
        if layer.batchnorm_slide_percent != 0.:
            # NOTE(review): means_old is updated from sd_input, which looks
            # like a copy-paste slip (a means expression would be expected).
            # Left unchanged because the layer's attributes are not visible
            # here -- confirm against NetConvPoolLayer.
            batchnorm_sliding_updates += [
                (layer.sd_input_old, layer.sd_input),
                (layer.means_old, layer.sd_input)]

    # combined updates
    self.all_updates = (self.current_change_update + updates +
                        momentum_updates + batchnorm_sliding_updates)

    # ---- compiled functions; each reads its batch from the shared buffers ----
    output_errors = self.layers[last_index].errors(self.y)
    self.test_model = theano.function(
        [], output_errors,
        givens={x: self.testing_x, y: self.testing_y})
    self.validate_model = theano.function(
        [], output_errors,
        givens={x: self.validation_x, y: self.validation_y})
    self.train_model = theano.function(
        [], self.cost,
        updates=self.all_updates,
        givens={x: self.training_x, y: self.training_y})

    # ---- training-loop state ----
    self.patience = 20000
    self.patience_increase = 3
    self.improvement_threshold = 0.995
    self.validation_frequency = min(self.n_train_batches, self.patience // 2)
    self.best_validation_loss = np.inf
    self.best_iter = 0
    self.itermode = 'train'  # DEPRECATED
    self.test_score = 0.
    self.start_time = timeit.default_timer()
    self.epoch = 0
    self.iter_i = 0  # not called `iter`, to avoid shadowing the builtin
    self.done_looping = False
    self.param_index = param_index
    print('initialized neural network object')
def work(mode, data_name, test_dataname, pooling_mode="average_exc_pad"):
    """Train, test or deploy the document-embedding classifier.

    The model is DocEmbeddingNN -> tanh hidden layer (10 units) -> 2-class
    logistic regression.  Parameters are loaded from
    ``data/<data_name>/model/multi_input_mergeinput<pooling_mode>.model``
    before anything else happens.

    Parameters
    ----------
    mode : str
        ``"train"``: validate once, then train for 10 epochs, validating and
        saving the parameters after every epoch.
        ``"test"``: validate once on the test corpus and return the metrics.
        ``"deploy"``: predict labels for a fixed corpus slice and write them
        to ``data/test/res/res<count>``.
        Any other value only builds the model and loads the parameters.
    data_name : str
        Dataset directory under ``data/`` used for the model file and the
        training corpus.
    test_dataname : str
        Dataset directory under ``data/`` used for the test corpus.
    pooling_mode : str
        Forwarded to ``DocEmbeddingNN`` and part of the model file name.

    Returns
    -------
    tuple or None
        In ``"test"`` mode ``(error, roc_auc, tpr, fpr, ar)`` where tpr/fpr are
        taken at threshold 1 of the predicted labels and
        ``ar = (tpr + 1 - fpr) / 2``.  ``None`` in every other mode.
    """
    import sys  # local import: used for the newline-free progress dots

    print("mode:  " + str(mode))
    print("data_name:  " + str(data_name))
    print("pooling_mode:  " + str(pooling_mode))
    print("Started!")
    rng = numpy.random.RandomState(23455)
    docSentenceCount = T.ivector("docSentenceCount")
    sentenceWordCount = T.ivector("sentenceWordCount")
    corpus = T.matrix("corpus")
    corpusPos = T.matrix("corpusPos")
    docLabel = T.ivector('docLabel')

    # word features and position features are concatenated column-wise
    corpus0 = T.concatenate([corpus, corpusPos], axis=1)

    # for list-type data
    layer0 = DocEmbeddingNN(corpus0, docSentenceCount, sentenceWordCount, rng,
                            wordEmbeddingDim=249,
                            sentenceLayerNodesNum=50,
                            sentenceLayerNodesSize=[5, 249],
                            docLayerNodesNum=10,
                            docLayerNodesSize=[3, 50],
                            pooling_mode=pooling_mode)

    layer1 = HiddenLayer(rng,
                         input=layer0.output,
                         n_in=layer0.outputDimension,
                         n_out=10,
                         activation=T.tanh)

    layer2 = LogisticRegression(input=layer1.output, n_in=10, n_out=2)

    # construct the parameter array.
    params = layer2.params + layer1.params + layer0.params

    # Load the parameters saved last time.
    para_path = "data/" + data_name + "/model/multi_input_mergeinput" + pooling_mode + ".model"
    traintext = "data/" + data_name + "/train/text"
    trainlabel = "data/" + data_name + "/train/label"
    testtext = "data/" + test_dataname + "/test/text"
    testlabel = "data/" + test_dataname + "/test/label"

    loadParamsVal(para_path, params)

    if mode == "train" or mode == "test":
        learning_rate = 0.1
        error = layer2.errors(docLabel)
        cost = layer2.negative_log_likelihood(docLabel)

        grads = T.grad(cost, params)

        # plain stochastic gradient descent
        updates = [(param_i, param_i - learning_rate * grad_i)
                   for param_i, grad_i in zip(params, grads)]

        print("Loading test data.")
        cr_test = CorpusReader(minDocSentenceNum=5,
                               minSentenceWordNum=5,
                               dataset=testtext,
                               labelset=testlabel)
        (validDocMatrixes, validDocSentenceNums, validSentenceWordNums,
         validIds, validLabels, _, validPosList) = cr_test.getCorpus([0, 1000])

        validDocMatrixes = transToTensor(validDocMatrixes, theano.config.floatX)
        validDocSentenceNums = transToTensor(validDocSentenceNums, numpy.int32)
        validSentenceWordNums = transToTensor(validSentenceWordNums, numpy.int32)
        validLabels = transToTensor(validLabels, numpy.int32)
        validPosList = transToTensor(validPosList, theano.config.floatX)
        print("Data loaded.")

        # last output: probability assigned to class 1 for every document
        valid_model = theano.function(
            [],
            [cost, error, layer2.y_pred, docLabel,
             T.transpose(layer2.p_y_given_x)[1]],
            givens={
                corpus: validDocMatrixes,
                corpusPos: validPosList,
                docSentenceCount: validDocSentenceNums,
                sentenceWordCount: validSentenceWordNums,
                docLabel: validLabels
            })

        def validate(show_curves):
            """Run valid_model, print its metrics and return them.

            Returns (error, roc_auc, tpr, fpr, ar); tpr/fpr are read at
            threshold 1 of the ROC curve built from the predicted labels.
            """
            costNum, errorNum, pred_label, real_label, pred_prob = valid_model()
            print("Valid current model:")
            print("Cost:  " + str(costNum))
            print("Error:  " + str(errorNum))

            fpr, tpr, _ = roc_curve(real_label, pred_prob)
            if show_curves:
                print("tpr_all:  " + str(tpr))
                print("fpr_all:  " + str(fpr))
            roc_auc = auc(fpr, tpr)
            print("data_name:  " + str(data_name))
            print("test_dataname:  " + str(test_dataname))
            print("ROC:  " + str(roc_auc))

            fpr, tpr, threshold = roc_curve(real_label, pred_label)
            index_of_one = list(threshold).index(1)
            ar = (tpr[index_of_one] + 1 - fpr[index_of_one]) / 2
            print("TPR:  " + str(tpr[index_of_one]))
            print("FPR:  " + str(fpr[index_of_one]))
            print("AR:  " + str(ar))
            print("threshold:  " + str(threshold[index_of_one]))
            return errorNum, roc_auc, tpr[index_of_one], fpr[index_of_one], ar

        # #### Validate the loaded model ####
        metrics = validate(show_curves=(mode == "test"))
        if mode == "test":
            valid_model.free()
            return metrics

        print("Loading train data.")
        cr_train = CorpusReader(minDocSentenceNum=5,
                                minSentenceWordNum=5,
                                dataset=traintext,
                                labelset=trainlabel)
        (docMatrixes, docSentenceNums, sentenceWordNums,
         ids, labels, _, posList) = cr_train.getCorpus([0, 100000])

        docMatrixes = transToTensor(docMatrixes, theano.config.floatX)
        docSentenceNums = transToTensor(docSentenceNums, numpy.int32)
        sentenceWordNums = transToTensor(sentenceWordNums, numpy.int32)
        labels = transToTensor(labels, numpy.int32)
        posList = transToTensor(posList, theano.config.floatX)

        print("")
        index = T.lscalar("index")
        batchSize = 10
        # floor division keeps the batch count an integer on Python 2 and 3
        n_batches = (len(docSentenceNums.get_value()) - 1 - 1) // batchSize + 1
        print("")
        print("Train set size is  " + str(len(docMatrixes.get_value())))
        print("Validating set size is  " + str(len(validDocMatrixes.get_value())))
        print("Batch size is  " + str(batchSize))
        print("Number of training batches is  " + str(n_batches))

        print("Compiling computing graph.")

        # for list-type data: the docSentenceCount slice holds one more entry
        # than the label slice for the same batch
        train_model = theano.function(
            [index], [cost, error, layer2.y_pred, docLabel],
            updates=updates,
            givens={
                corpus: docMatrixes,
                corpusPos: posList,
                docSentenceCount: docSentenceNums[index * batchSize:(index + 1) * batchSize + 1],
                sentenceWordCount: sentenceWordNums,
                docLabel: labels[index * batchSize:(index + 1) * batchSize],
            })

        print("Compiled.")
        print("Start to train.")
        epoch = 0
        n_epochs = 10
        ite = 0

        while epoch < n_epochs:
            epoch = epoch + 1
            for i in range(n_batches):
                # progress dots before and after every batch, no newline
                sys.stdout.write(". ")
                costNum, errorNum, pred_label, real_label = train_model(i)
                sys.stdout.write(". ")
                ite = ite + 1
                if ite % 10 == 0:
                    print("")
                    print("@iter:  " + str(ite))
                    print("Cost:  " + str(costNum))
                    print("Error:  " + str(errorNum))

            # NOTE(review): validation and saving are placed once per epoch;
            # confirm this nesting against the intended training schedule.
            validate(show_curves=False)

            # Save model
            print("Saving parameters.")
            saveParamsVal(para_path, params)
            print("Saved.")
        valid_model.free()
        train_model.free()
    elif mode == "deploy":
        print("Compiling computing graph.")
        output_model = theano.function(
            [corpus, docSentenceCount, sentenceWordCount], [layer2.y_pred])
        print("Compiled.")
        cr = CorpusReader(minDocSentenceNum=5,
                          minSentenceWordNum=5,
                          dataset="data/train_valid/split")
        count = 21000
        # with these bounds the loop body runs exactly once
        while count <= 21000:
            docMatrixes, docSentenceNums, sentenceWordNums, ids = cr.getCorpus(
                [count, count + 100])
            docMatrixes = numpy.matrix(docMatrixes, dtype=theano.config.floatX)
            docSentenceNums = numpy.array(docSentenceNums, dtype=numpy.int32)
            sentenceWordNums = numpy.array(sentenceWordNums, dtype=numpy.int32)
            print("start to predict.")
            pred_y = output_model(docMatrixes, docSentenceNums, sentenceWordNums)
            print("End predicting.")
            print("Writing resfile.")
            # list(...) so the (id, prediction) pairs are written on Python 3
            # too; `with` closes the file even if the write fails
            with open("data/test/res/res" + str(count), "w") as res_file:
                res_file.write(str(list(zip(ids, pred_y[0]))))
            print("Written." + str(count))
            count += 100
def evaluate_lenet5(learning_rate=0.12, n_epochs=200,
                    nkerns=(20, 30, 20, 50, 20), batch_size=20):
    """Train a LeNet-style network with SGD and early stopping.

    The data returned by ``load_data()`` is reshaped into single-channel
    64x64 images, pushed through three (1x1 conv -> conv + 2x2 pool)
    stages, average-pooled and classified by a 40-way logistic regression.

    :type learning_rate: float
    :param learning_rate: step size used in the SGD update rule

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: sequence of ints
    :param nkerns: number of kernels of the first five convolutional
        layers (the sixth one always has 40 kernels)

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :return: mean of the best validation error and the test error measured
        at that point; ``numpy.inf`` if no validation pass ever improved
    """
    rng = numpy.random.RandomState()

    datasets = load_data()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # number of complete minibatches for training, validation and testing
    n_train_batches = int(train_set_x.get_value(borrow=True).shape[0] / batch_size)
    n_valid_batches = int(valid_set_x.get_value(borrow=True).shape[0] / batch_size)
    n_test_batches = int(test_set_x.get_value(borrow=True).shape[0] / batch_size)

    # symbolic inputs
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # rasterized images, one per row
    y = T.ivector('y')   # labels as a 1D vector of ints

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    layer0_input = x.reshape((batch_size, 1, 64, 64))

    layer0 = ConvLayer(
        rng,
        input=layer0_input,
        image_shape=(batch_size, 1, 64, 64),
        filter_shape=(nkerns[0], 1, 1, 1),
    )
    layer0b = LeNetConvPoolLayer(
        rng,
        input=layer0.output,
        image_shape=(batch_size, nkerns[0], 64, 64),
        filter_shape=(nkerns[1], nkerns[0], 5, 5),
        poolsize=(2, 2)
    )
    layer1 = ConvLayer(
        rng,
        input=layer0b.output,
        image_shape=(batch_size, nkerns[1], 30, 30),
        filter_shape=(nkerns[2], nkerns[1], 1, 1),
    )
    layer1b = LeNetConvPoolLayer(
        rng,
        input=layer1.output,
        image_shape=(batch_size, nkerns[2], 30, 30),
        filter_shape=(nkerns[3], nkerns[2], 3, 3),
        poolsize=(2, 2)
    )
    layer1c = ConvLayer(
        rng,
        input=layer1b.output,
        image_shape=(batch_size, nkerns[3], 14, 14),
        filter_shape=(nkerns[4], nkerns[3], 1, 1),
    )
    layer1d = LeNetConvPoolLayer(
        rng,
        input=layer1c.output,
        image_shape=(batch_size, nkerns[4], 14, 14),
        filter_shape=(40, nkerns[4], 3, 3),
        poolsize=(2, 2)
    )

    # average-pool the remaining 6x6 maps, then flatten to (batch, 40)
    layer2 = AveragePoolLayer(
        input=layer1d.output,
        poolsize=(6, 6)
    )
    layer3_input = theano.tensor.flatten(layer2.output, outdim=2)

    # classification layer
    layer3 = LogisticRegression(input=layer3_input, n_in=40, n_out=40)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # functions computing the error rate of one minibatch
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # all parameters to be fit by gradient descent (layer2 has none listed)
    params = (layer3.params + layer1d.params + layer1c.params +
              layer1b.params + layer1.params + layer0b.params + layer0.params)
    grads = T.grad(cost, params)

    # plain SGD: one (parameter, new value) pair per parameter
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 2000                # look at this many minibatches regardless
    patience_increase = 2          # wait this much longer on a new best
    improvement_threshold = 0.995  # relative improvement considered significant
    # validate after this many minibatches (at most once per epoch)
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    # Bound up front so the final ``return`` cannot hit an unbound local when
    # no validation pass ever improves (e.g. the loss is NaN, or there are no
    # training batches).
    best_score = numpy.inf
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if iteration % 100 == 0:
                # NOTE(review): ``learning_rate`` was a plain Python float
                # when ``updates`` was built, so this decay only changes the
                # local variable and does not affect ``train_model``. Use a
                # theano shared variable if the decay is meant to take effect.
                learning_rate *= 0.8
                print('training @ iter = ', iteration)

            train_model(minibatch_index)

            if (iteration + 1) % validation_frequency == 0:
                # zero-one loss on the validation set
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print(('epoch %i, minibatch %i/%i, validation error %f %%' %
                       (epoch, minibatch_index + 1, n_train_batches,
                        this_validation_loss * 100.)))

                if this_validation_loss < best_validation_loss:
                    # extend patience if the improvement is large enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # evaluate the new best model on the test set
                    test_losses = [
                        test_model(i)
                        for i in range(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print((('     epoch %i, minibatch %i/%i, test error of '
                            'best model %f %%') %
                           (epoch, minibatch_index + 1, n_train_batches,
                            test_score * 100.)))
                    best_score = (this_validation_loss + test_score) / 2

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(('Best validation score of %f %% obtained at iteration %i, '
           'with test performance %f %%' %
           (best_validation_loss * 100., best_iter + 1, test_score * 100.)))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
    return best_score
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=(500, 500), n_outs=10):
    """Build a deep belief network (DBN) with a variable number of layers.

    :param numpy_rng: numpy.random.RandomState used to initialise the weights
    :param theano_rng: theano.tensor.shared_randomstreams.RandomStreams;
        when not given, one is created and seeded from ``numpy_rng``
    :param n_ins: int, dimension of the DBN input
    :param hidden_layers_sizes: sequence of ints, size of each hidden layer
        (at least one entry is required)
    :param n_outs: int, dimension of the network output
    :return: None
    """
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2**30))

    # symbolic variables for the data and the labels
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    # The DBN is an MLP whose hidden-layer weights are each shared with an
    # RBM. The network is first built as a deep multilayer perceptron; for
    # every sigmoid layer an RBM sharing its weights is built as well.
    # Pre-training trains the RBMs (which also changes the MLP weights);
    # fine-tuning then trains the MLP by stochastic gradient descent.
    layer_input = self.x   # the first layer reads the network input
    input_size = n_ins     # ... whose width is the network input size
    for layer_size in hidden_layers_sizes:
        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=layer_size,
                                    activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(sigmoid_layer)

        # Only the sigmoid layers' parameters belong to the DBN; the visible
        # biases of the RBMs are RBM parameters, not DBN parameters.
        self.params.extend(sigmoid_layer.params)

        # RBM sharing weights and hidden bias with the sigmoid layer
        rbm_layer = RBM(input=layer_input,
                        n_visible=input_size,
                        n_hidden=layer_size,
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b,
                        numpy_rng=numpy_rng,
                        theano_rng=theano_rng)
        self.rbm_layers.append(rbm_layer)

        # every later layer reads the activations of the layer below it
        layer_input = sigmoid_layer.output
        input_size = layer_size

    # logistic regression layer on top of the network
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # fine-tuning cost: negative log likelihood of the output layer
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

    # symbolic per-minibatch error given self.x and self.y
    self.errors = self.logLayer.errors(self.y)
def __init__(self, rng, input, layer_sizes, use_bias=True, rectifier=None):
    """Build an MLP twice: a dropout path for training and a scaled path.

    Both paths share parameters. ``self.dropout_layers`` holds the dropout
    path, ``self.layers`` the path that reuses the same weights multiplied
    by 0.5.

    :param rng: random number generator handed to every layer
    :param input: symbolic input of the network
    :param layer_sizes: sequence of layer widths, input first, output last
    :param use_bias: forwarded to every hidden layer
    :param rectifier: ``'soft'`` (softplus) or ``'hard'`` (``max(0, x)``);
        required as soon as the network has at least one hidden layer
    :raises ValueError: if hidden layers exist and ``rectifier`` is neither
        ``'soft'`` nor ``'hard'``
    """
    # list() so the (n_in, n_out) pairs can be sliced and indexed on
    # Python 3, where zip() returns an iterator.
    weight_matrix_sizes = list(zip(layer_sizes, layer_sizes[1:]))
    hidden_sizes = weight_matrix_sizes[:-1]

    rectified_linear_activation = None
    if rectifier == 'soft':
        rectified_linear_activation = lambda x: T.nnet.softplus(x)
    elif rectifier == 'hard':
        rectified_linear_activation = lambda x: T.maximum(0.0, x)
    elif hidden_sizes:
        # Previously this case left the activation unbound and crashed with
        # an UnboundLocalError on the first hidden layer.
        raise ValueError(
            "rectifier must be 'soft' or 'hard', got %r" % (rectifier,))

    # Set up all the hidden layers
    self.layers = []
    self.dropout_layers = []
    next_layer_input = input
    # dropout the input with prob 0.2
    next_dropout_layer_input = _dropout_from_layer(rng, input, p=0.2)

    for n_in, n_out in hidden_sizes:
        next_dropout_layer = DropoutHiddenLayer(
            rng=rng,
            input=next_dropout_layer_input,
            activation=rectified_linear_activation,
            n_in=n_in, n_out=n_out, use_bias=use_bias)
        self.dropout_layers.append(next_dropout_layer)
        next_dropout_layer_input = next_dropout_layer.output

        # Reuse the parameters from the dropout layer here, in a different
        # path through the graph.
        # NOTE(review): the 0.5 factor is applied to every layer, including
        # the first one whose input is dropped with p=0.2 - confirm that
        # this matches the rates used inside DropoutHiddenLayer.
        next_layer = HiddenLayer(
            rng=rng,
            input=next_layer_input,
            activation=rectified_linear_activation,
            W=next_dropout_layer.W * 0.5,
            b=next_dropout_layer.b,
            n_in=n_in, n_out=n_out,
            use_bias=use_bias)
        self.layers.append(next_layer)
        next_layer_input = next_layer.output

    # Set up the output layer
    n_in, n_out = weight_matrix_sizes[-1]
    dropout_output_layer = LogisticRegression(
        input=next_dropout_layer_input,
        n_in=n_in, n_out=n_out)
    self.dropout_layers.append(dropout_output_layer)

    # Again, reuse parameters in the dropout output.
    output_layer = LogisticRegression(
        input=next_layer_input,
        W=dropout_output_layer.W * 0.5,
        b=dropout_output_layer.b,
        n_in=n_in, n_out=n_out)
    self.layers.append(output_layer)

    # Use the negative log likelihood of the logistic regression layer as
    # the objective.
    self.dropout_negative_log_likelihood = \
        self.dropout_layers[-1].negative_log_likelihood
    self.dropout_errors = self.dropout_layers[-1].errors

    self.negative_log_likelihood = self.layers[-1].negative_log_likelihood
    self.errors = self.layers[-1].errors

    # Grab all the parameters together (the dropout path owns them).
    self.params = [param
                   for layer in self.dropout_layers
                   for param in layer.params]
def __init__(self, rng, n_in, n_hidden, n_out, x=None, y=None,
             activation=T.tanh, lambda_reg=0.001, alpha_reg=0.0):
    r"""Initialize the parameters for the multilayer perceptron

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in
        which the datapoints lie

    :type n_hidden: sequence of ints
    :param n_hidden: number of units of each hidden layer; may be empty, in
        which case the model is a plain logistic regression on the input

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in
        which the labels lie

    :param x: symbolic input; a new ``T.matrix('x')`` is created when None

    :param y: symbolic labels; a new ``T.ivector('y')`` is created when None

    :param activation: activation passed to every hidden layer

    :type lambda_reg: float
    :param lambda_reg: overall weight of the regularization term
        lambda_reg( (1-alpha_reg)/2 * \sum||W||_2^2 + alpha_reg \sum||W||_1 ).
        The larger lambda_reg is, the stronger the penalty.

    :type alpha_reg: float
    :param alpha_reg: parameter from the interval [0,1] balancing the l_1
        norm against the squared l_2 norm in the term above. The smaller
        alpha_reg is, the smoother the weights are; the larger it is, the
        sparser they are.
    """
    self.hidden_layers = []
    self.params = []
    self.n_layers = len(n_hidden)

    # ``is None`` rather than truthiness: symbolic variables should not be
    # evaluated as booleans.
    if x is None:
        x = T.matrix('x')
    self.x = x
    if y is None:
        y = T.ivector('y')
    self.y = y

    # Stack the hidden layers; each one reads the output of the previous
    # one, the first one reads the network input.
    layer_input = self.x
    layer_n_in = n_in
    for layer_n_out in n_hidden:
        hd = HiddenLayer(rng=rng, input=layer_input, n_in=layer_n_in,
                         n_out=layer_n_out, activation=activation)
        self.hidden_layers.append(hd)
        self.params.extend(hd.params)
        layer_input = hd.output
        layer_n_in = layer_n_out

    # The logistic regression layer reads the last hidden layer, or the raw
    # input when there are no hidden layers.
    self.logRegressionLayer = LogisticRegression(
        input=layer_input,
        n_in=layer_n_in,
        n_out=n_out)
    self.params.extend(self.logRegressionLayer.params)

    # regularization terms over every weight matrix, hidden layers first
    weight_matrices = [layer.W for layer in self.hidden_layers]
    weight_matrices.append(self.logRegressionLayer.W)
    L1s = [abs(W).sum() for W in weight_matrices]
    L2_sqrs = [(W ** 2).sum() for W in weight_matrices]
    self.L1 = T.sum(L1s)
    self.L2_sqr = T.sum(L2_sqrs)

    # negative log likelihood of the MLP is given by the negative
    # log likelihood of the output of the model, computed in the
    # logistic regression layer
    self.negative_log_likelihood = \
        self.logRegressionLayer.negative_log_likelihood
    # same holds for the function computing the number of errors
    self.errors = self.logRegressionLayer.errors(self.y)
    # cost function to be minimized
    self.cost = self.negative_log_likelihood(self.y) \
        + lambda_reg * ((1.0 - alpha_reg) * 0.5 * self.L2_sqr
                        + alpha_reg * self.L1)
    self.y_pred = self.logRegressionLayer.y_pred
def sgd_optimization_mnist(tr_start_index=1, tr_limit=5000, vl_start_index=1,
                           vl_limit=5000, learning_rate=0.015, n_epochs=5000,
                           output_filename="ls.out"):
    """Train a logistic regression classifier with SGD and log the progress.

    Training and validation patches are read through
    ``tdtf.read_data_patch_to_ndarray``; progress lines are written to
    ``output_filename`` and a timing summary to ``sys.stderr``.

    :param tr_start_index: start index passed to the training-set reader
    :param tr_limit: limit passed to the training-set reader; also used as
        the minibatch size
    :param vl_start_index: start index passed to the validation-set reader
    :param vl_limit: limit passed to the validation-set reader
    :param learning_rate: step size of the SGD update
    :param n_epochs: maximal number of epochs to run
    :param output_filename: path of the text log (overwritten)
    """
    # Function-scope imports: both are standard library and available on
    # Python 2 and 3 (``time.clock`` was removed in Python 3.8).
    import pickle
    from timeit import default_timer

    # ``with`` guarantees the log is flushed and closed even when data
    # loading, compilation or training raises.
    with open(output_filename, 'w') as output_file:

        def log(message):
            """Write one line to the training log."""
            output_file.write(message + '\n')

        # allocate symbolic variables for the data
        index = T.lscalar()  # index to a [mini]batch
        x = T.matrix('x')    # rasterized images, one per row
        y = T.ivector('y')   # labels as a 1D vector of ints

        in_shape = layer0_input_shape[0] * layer0_input_shape[1]
        batch_size = tr_limit

        train_set = tdtf.read_data_patch_to_ndarray(
            train_dataset_route, tr_start_index, tr_limit)
        train_set_x, train_set_y = load_data.shared_dataset(train_set)

        valid_set = tdtf.read_data_patch_to_ndarray(
            valid_dataset_route, vl_start_index, vl_limit)
        print(valid_set[1])
        valid_set_x, valid_set_y = load_data.shared_dataset(valid_set)

        # number of complete minibatches; floor division keeps these ints
        # on Python 3 as well
        n_train_batches = \
            train_set_x.get_value(borrow=True).shape[0] // batch_size
        n_valid_batches = \
            valid_set_x.get_value(borrow=True).shape[0] // batch_size

        # NOTE(review): the stored model is loaded when the flag is FALSE,
        # which looks inverted relative to its name - confirm against the
        # code that sets ``if_load_trained_model`` before changing it.
        if not if_load_trained_model:
            # SECURITY: unpickling executes arbitrary code; only load model
            # files from a trusted source.
            with open(train_model_route, 'rb') as trained_model_pkl:
                # The first pickled object is discarded; the load is still
                # needed to advance the file to the numpy array behind it.
                pickle.load(trained_model_pkl)
                trained_model_state_array = numpy.load(trained_model_pkl)
            classifier_state = trained_model_state_array[0]
            classifier = LogisticRegression(
                input=x, n_in=in_shape, n_out=layer0_output_shape,
                W=classifier_state[0], b=classifier_state[1])
        else:
            ######################
            # BUILD ACTUAL MODEL #
            ######################
            classifier = LogisticRegression(
                input=x, n_in=in_shape, n_out=layer0_output_shape)

        # the cost we minimize during training is the negative log
        # likelihood of the model in symbolic format
        cost = classifier.negative_log_likelihood(y)

        # error rate of the model on one validation minibatch
        validate_model = theano.function(
            inputs=[index],
            outputs=classifier.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

        # gradient of the cost with respect to theta = (W, b)
        g_W = T.grad(cost=cost, wrt=classifier.W)
        g_b = T.grad(cost=cost, wrt=classifier.b)

        # (variable, update expression) pairs of plain SGD
        updates = [(classifier.W, classifier.W - learning_rate * g_W),
                   (classifier.b, classifier.b - learning_rate * g_b)]

        # returns (cost, error rate) of one training minibatch and applies
        # ``updates`` as a side effect
        train_model = theano.function(
            inputs=[index],
            outputs=[cost, classifier.errors(y)],
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})

        ###############
        # TRAIN MODEL #
        ###############
        # early-stopping parameters
        patience = 50000               # look at this many minibatches regardless
        patience_increase = 2          # wait this much longer on a new best
        improvement_threshold = 0.995  # relative improvement considered significant
        # validate after this many minibatches (at most once per epoch)
        validation_frequency = min(n_train_batches, patience // 2)

        best_validation_loss = numpy.inf
        # testing is disabled in this function, so this stays 0 in the summary
        test_score = 0.
        start_time = default_timer()
        best_train_loss = numpy.inf

        done_looping = False
        epoch = 0
        last_train_err = 1
        last_train_cost = 1
        while (epoch < n_epochs) and (not done_looping):
            epoch = epoch + 1
            for minibatch_index in range(n_train_batches):
                minibatch_avg_cost, train_err = train_model(minibatch_index)

                # relative change (in percent) versus the previous minibatch
                decreasing_rate = \
                    (last_train_err - train_err) / (last_train_err) * 100.
                last_train_err = train_err
                c_d_rate = (last_train_cost - minibatch_avg_cost) \
                    / (last_train_cost) * 100.
                last_train_cost = minibatch_avg_cost

                log('epoch %i, minibatch %i/%i, train_cost %f , train_error %.2f %%, decreasing rate %f %%, cost_decreasing rate %f %%' %
                    (epoch, minibatch_index + 1, n_train_batches,
                     minibatch_avg_cost, train_err * 100.,
                     decreasing_rate, c_d_rate))

                if best_train_loss > train_err:
                    best_train_loss = train_err

                # iteration number
                iteration = (epoch - 1) * n_train_batches + minibatch_index

                if (iteration + 1) % validation_frequency == 0:
                    # zero-one loss on the validation set
                    validation_losses = [validate_model(i)
                                         for i in range(n_valid_batches)]
                    this_validation_loss = numpy.mean(validation_losses)

                    log('epoch %i, minibatch %i/%i, validation error %f %%' %
                        (epoch, minibatch_index + 1, n_train_batches,
                         this_validation_loss * 100.))

                    if this_validation_loss < best_validation_loss:
                        # extend patience if the improvement is large enough
                        if this_validation_loss < best_validation_loss * \
                                improvement_threshold:
                            patience = max(patience,
                                           iteration * patience_increase)

                        best_validation_loss = this_validation_loss

                if patience <= iteration:
                    done_looping = True
                    break

        end_time = default_timer()
        log(('Optimization complete with best validation score of %f %%,'
             'with test performance %f %%'
             'with best train_performance %f %%') %
            (best_validation_loss * 100., test_score * 100.,
             best_train_loss * 100.))
        log('The code run for %d epochs, with %f epochs/sec' % (
            epoch, 1. * epoch / (end_time - start_time)))
        sys.stderr.write('The code for file ' +
                         os.path.split(__file__)[1] +
                         ' ran for %.1fs' % ((end_time - start_time)) + '\n')
def __init__(self, rng, input, nkerns, batch_size, image_size, image_dimension):
    """Build six conv layers, two dense ReLU layers and a 2-way classifier.

    Convolution filter edges are 3, 3, 5, 7, 7 and 11; ``pool_flag`` is set
    on the third and sixth convolutional layer only. For a 100x100 input
    the feature-map sides are 98, 96, 46, 40, 34 and 12.

    :param rng: random number generator handed to every weighted layer
    :param input: symbolic matrix of rasterized images, reshaped here to
        ``(batch_size, image_dimension, image_size[0], image_size[1])``
    :param nkerns: number of kernels of the six convolutional layers
        (indices 0 to 5 are read)
    :param batch_size: number of images per minibatch
    :param image_size: (height, width) of one input image
    :param image_dimension: number of channels of one input image
    """
    # (filter edge, pool_flag) of each convolutional layer, in order
    conv_specs = [(3, False), (3, False), (5, True),
                  (7, False), (7, False), (11, True)]

    # Reshape matrix of rasterized images to a 4D tensor, compatible with
    # LeNetConvPoolLayer.
    self.layer0_input = input.reshape(
        (batch_size, image_dimension, image_size[0], image_size[1]))

    # Feature-map geometry is tracked from ``image_size`` instead of being
    # hard-coded for 100x100 inputs. Per the original inline comments a
    # filter of edge k shrinks each side to (side - k + 1) and pooling
    # halves it.
    # NOTE(review): assumes LeNetConvPoolLayer does a 'valid' convolution
    # and a floor-dividing 2x2 pool when pool_flag is True - confirm.
    height, width = image_size[0], image_size[1]
    in_channels = image_dimension
    layer_input = self.layer0_input
    conv_layers = []
    for position, (filter_edge, pool_flag) in enumerate(conv_specs):
        out_channels = nkerns[position]
        layer = LeNetConvPoolLayer(
            rng,
            input=layer_input,
            image_shape=(batch_size, in_channels, height, width),
            filter_shape=(out_channels, in_channels, filter_edge, filter_edge),
            poolsize=(2, 2),
            pool_flag=pool_flag
        )
        conv_layers.append(layer)

        height = height - filter_edge + 1
        width = width - filter_edge + 1
        if pool_flag:
            height //= 2
            width //= 2
        in_channels = out_channels
        layer_input = layer.output

    (self.layer0, self.layer1, self.layer2,
     self.layer3, self.layer4, self.layer5) = conv_layers

    # The HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels); flatten the last feature maps to
    # (batch_size, nkerns[5] * height * width), i.e. nkerns[5] * 12 * 12
    # columns for a 100x100 input.
    self.layer6_input = self.layer5.output.flatten(2)

    # two fully-connected ReLU layers
    self.layer6 = HiddenLayer(
        rng,
        input=self.layer6_input,
        n_in=nkerns[5] * height * width,
        n_out=1500,
        activation=T.nnet.relu
    )
    self.layer7 = HiddenLayer(
        rng,
        input=self.layer6.output,
        n_in=1500,
        n_out=500,
        activation=T.nnet.relu
    )

    # classify the values of the last fully-connected layer
    self.layer8 = LogisticRegression(input=self.layer7.output,
                                     n_in=500, n_out=2)

    # all model parameters to be fit by gradient descent, top layer first
    self.params = self.layer8.params + self.layer7.params + self.layer6.params + \
        self.layer5.params + self.layer4.params + self.layer3.params + \
        self.layer2.params + self.layer1.params + self.layer0.params
def __init__(self, input, batch_size, activation, state=None):
    """Assemble a conv+pool layer, one dense layer and a 2-way classifier.

    :param input: symbolic input, handed unchanged to the conv layer with a
        declared shape of ``(batch_size, NUM_CHANNELS, 13, 13)``
    :param batch_size: number of examples per minibatch
    :param activation: activation used by the conv and the dense layer
    :param state: optional sequence; its slices ``[0:2]``, ``[2:4]`` and
        ``[4:6]`` are passed as ``state`` to the conv, dense and logistic
        layer respectively. ``None`` passes ``None`` to all three.
    """
    weight_rng = np.random.RandomState(23455)

    # network geometry
    side = 13               # input images are side x side
    channels = NUM_CHANNELS
    kernel = 3              # conv filter edge
    n_filters = 16          # conv output channels
    pooled_side = 5         # side of one feature map after conv + pool
    n_dense = 16            # width of the fully connected layer

    self.input = input

    # split the saved state into one chunk per layer
    if state is not None:
        conv_state = state[0:2]
        dense_state = state[2:4]
        logreg_state = state[4:6]
    else:
        conv_state = dense_state = logreg_state = None

    conv_pool = LeNetConvPoolLayer(
        weight_rng,
        input=input,
        image_shape=(batch_size, channels, side, side),
        filter_shape=(n_filters, channels, kernel, kernel),
        poolsize=(3, 3),
        activation=activation,
        state=conv_state)
    self.conv_pool_layer = conv_pool

    dense = HiddenLayer(
        weight_rng,
        input=conv_pool.output.flatten(2),
        n_in=n_filters * pooled_side * pooled_side,
        n_out=n_dense,
        activation=activation,
        state=dense_state)
    self.fullyconnected_layer = dense

    logreg = LogisticRegression(
        input=dense.output,
        n_in=n_dense,
        n_out=2,
        state=logreg_state)
    self.log_regression_layer = logreg

    # L1 norm of all weight matrices
    self.L1 = (abs(conv_pool.W).sum()
               + abs(dense.W).sum()
               + abs(logreg.W).sum())

    # trainable parameters, bottom layer first
    self.params = conv_pool.params + dense.params + logreg.params
image_shape=(batch_size, nkerns[0], 20, 20), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2) \ ) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 8 * 8, n_out=100, activation=T.tanh \ ) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=100, n_out=3 \ ) # definition for theano.function test_results = theano.function(inputs=[x], \ outputs= layer3.y_pred) def load_trained_model(): global if_load_trained_model global train_model_route global layer0_input global layer0 global layer1 global layer2_input global layer2 global layer3
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300, batch_size=50, describ_max_len=20, type_size=12, filter_size=[3, 5], maxSentLen=100, hidden_size=[300, 300]): model_options = locals().copy() print "model options", model_options emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/' test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt' output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_onlyMT_BBN_epoch4.json' seed = 1234 np.random.seed(seed) rng = np.random.RandomState( seed) #random seed, control the model generates the same results srng = T.shared_randomstreams.RandomStreams(rng.randint(seed)) word2id = {} # all_sentences, all_masks, all_labels, all_other_labels, word2id=load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen) #minlen, include one label, at least one word in the sentence train_p1_sents, train_p1_masks, train_p1_labels, word2id = load_trainingData_types( word2id, maxSentLen) train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = load_trainingData_types_plus_others( word2id, maxSentLen) test_sents, test_masks, test_lines, word2id = load_official_testData_only_MT( word2id, maxSentLen, test_file_path) label_sent, label_mask = load_SF_type_descriptions(word2id, type_size, describ_max_len) label_sent = np.asarray(label_sent, dtype='int32') label_mask = np.asarray(label_mask, dtype=theano.config.floatX) train_p1_sents = np.asarray(train_p1_sents, dtype='int32') train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX) train_p1_labels = np.asarray(train_p1_labels, dtype='int32') train_p1_size = len(train_p1_labels) train_p2_sents = np.asarray(train_p2_sents, dtype='int32') train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX) train_p2_labels = np.asarray(train_p2_labels, dtype='int32') train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32') train_p2_size = len(train_p2_labels) 
''' combine train_p1 and train_p2 ''' train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0) train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0) train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0) train_size = train_p1_size + train_p2_size test_sents = np.asarray(test_sents, dtype='int32') test_masks = np.asarray(test_masks, dtype=theano.config.floatX) # test_labels=np.asarray(all_labels[2], dtype='int32') test_size = len(test_sents) vocab_size = len(word2id) + 1 # add one zero pad index rand_values = rng.normal( 0.0, 0.01, (vocab_size, emb_size)) #generate a matrix by Gaussian distribution rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX) id2word = {y: x for x, y in word2id.iteritems()} word2vec = load_fasttext_multiple_word2vec_given_file([ emb_root + '100k-ENG-multicca.300.ENG.txt', emb_root + '100k-IL9-multicca.d300.IL9.txt' ], 300) rand_values = load_word2vec_to_init(rand_values, id2word, word2vec) embeddings = theano.shared( value=np.array(rand_values, dtype=theano.config.floatX), borrow=True ) #wrap up the python variable "rand_values" into theano variable #now, start to build the input form of the model sents_id_matrix = T.imatrix('sents_id_matrix') sents_mask = T.fmatrix('sents_mask') labels = T.imatrix('labels') #batch*12 other_labels = T.imatrix() #batch*4 des_id_matrix = T.imatrix() des_mask = T.fmatrix() ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' common_input = embeddings[sents_id_matrix.flatten()].reshape( (batch_size, maxSentLen, emb_size)).dimshuffle( 0, 2, 1) #the input format can be adapted into CNN or GRU or LSTM bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2) repeat_common_input = T.repeat( normalize_tensor3_colwise(common_input), type_size, axis=0) #(batch_size*type_size, emb_size, maxsentlen) des_input = embeddings[des_id_matrix.flatten()].reshape( (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1) bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1), axis=2) #(tyope_size, emb_size) repeat_des_input = T.tile( normalize_tensor3_colwise(des_input), (batch_size, 1, 1)) #(batch_size*type_size, emb_size, maxsentlen) conv_W, conv_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W2, conv_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2] conv_att_W, conv_att_b = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0])) conv_W_context, conv_b_context = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) conv_att_W2, conv_att_b2 = create_conv_para(rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1])) conv_W_context2, conv_b_context2 = create_conv_para( rng, filter_shape=(hidden_size[0], 1, emb_size, 1)) ACNN_para = [ conv_att_W, conv_att_b, conv_W_context, conv_att_W2, conv_att_b2, conv_W_context2 ] ''' multi-CNN ''' conv_model = Conv_with_Mask( rng, input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), W=conv_W, b=conv_b ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings = conv_model.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size conv_model2 = Conv_with_Mask( rng, 
input_tensor3=common_input, mask_matrix=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), W=conv_W2, b=conv_b2 ) #mutiple mask with the conv_out to set the features by UNK to zero sent_embeddings2 = conv_model2.maxpool_vec #(batch_size, hidden_size) # each sentence then have an embedding of length hidden_size ''' GRU ''' U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0]) GRU_NN_para = [ U1, W1, b1 ] #U1 includes 3 matrices, W1 also includes 3 matrices b1 is bias # gru_input = common_input.dimshuffle((0,2,1)) #gru requires input (batch_size, emb_size, maxSentLen) gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask, hidden_size[0], U1, W1, b1) gru_sent_embeddings = gru_layer.output_sent_rep # (batch_size, hidden_size) ''' ACNN ''' attentive_conv_layer = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W, b=conv_att_b, W_context=conv_W_context, b_context=conv_b_context) sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l attentive_conv_layer2 = Attentive_Conv_for_Pair( rng, origin_input_tensor3=common_input, origin_input_tensor3_r=common_input, input_tensor3=common_input, input_tensor3_r=common_input, mask_matrix=sents_mask, mask_matrix_r=sents_mask, image_shape=(batch_size, 1, emb_size, maxSentLen), image_shape_r=(batch_size, 1, emb_size, maxSentLen), filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]), filter_shape_context=(hidden_size[0], 1, emb_size, 1), W=conv_att_W2, b=conv_att_b2, W_context=conv_W_context2, b_context=conv_b_context2) sent_att_embeddings2 = 
attentive_conv_layer2.attentive_maxpool_vec_l ''' cross-DNN-dataless ''' #first map label emb into hidden space HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para( rng, emb_size, hidden_size[0]) HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b] HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size, n_out=hidden_size[0], W=HL_layer_1_W, b=HL_layer_1_b, activation=T.tanh) des_rep_hidden = HL_layer_1.output #(type_size, hidden_size) dot_dnn_dataless_1 = T.tanh(sent_embeddings.dot( des_rep_hidden.T)) #(batch_size, type_size) dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T)) ''' dataless cosine ''' cosine_scores = normalize_matrix_rowwise(bow_emb).dot( normalize_matrix_rowwise(bow_des).T) cosine_score_matrix = T.nnet.sigmoid( cosine_scores) #(batch_size, type_size) ''' dataless top-30 fine grained cosine ''' fine_grained_cosine = T.batched_dot( repeat_common_input.dimshuffle(0, 2, 1), repeat_des_input) #(batch_size*type_size,maxsentlen,describ_max_len) fine_grained_cosine_to_matrix = fine_grained_cosine.reshape( (batch_size * type_size, maxSentLen * describ_max_len)) sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix, axis=1) top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:] # (batch_size*type_size, 5) max_fine_grained_cosine = T.mean(top_k_simi, axis=1) top_k_cosine_scores = max_fine_grained_cosine.reshape( (batch_size, type_size)) top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores) acnn_LR_input = T.concatenate([ dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix, top_k_score_matrix, sent_embeddings, sent_embeddings2, gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2, bow_emb ], axis=1) acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size #classification layer, it is just mapping from a feature vector of size "hidden_size" to a vector of only two values: positive, negative acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12) acnn_LR_para = [acnn_U_a, acnn_LR_b] 
acnn_layer_LR = LogisticRegression( rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12, W=acnn_U_a, b=acnn_LR_b ) #basically it is a multiplication between weight matrix and input feature vector acnn_score_matrix = T.nnet.sigmoid( acnn_layer_LR.before_softmax) #batch * 12 acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix, acnn_score_matrix) acnn_loss = -T.mean(T.log(acnn_prob_pos)) acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng, acnn_LR_input_size, 16) acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b] acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=16, W=acnn_other_U_a, b=acnn_other_LR_b) acnn_other_prob_matrix = T.nnet.softmax( acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4))) acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape( (batch_size, 4, 4)) acnn_other_prob = acnn_other_prob_tensor3[ T.repeat(T.arange(batch_size), 4), T.tile(T.arange(4), (batch_size)), other_labels.flatten()] acnn_other_field_loss = -T.mean(T.log(acnn_other_prob)) params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + HL_layer_1_params # put all model parameters together cost = acnn_loss + 1e-4 * ((conv_W**2).sum() + (conv_W2**2).sum() + (conv_att_W**2).sum() + (conv_att_W2**2).sum()) updates = Gradient_Cost_Para(cost, params, learning_rate) other_paras = params + acnn_other_LR_para cost_other = cost + acnn_other_field_loss other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate) ''' testing ''' ensemble_NN_scores = acnn_score_matrix #T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1), score_matrix.dimshuffle('x',0,1), acnn_score_matrix.dimshuffle('x',0,1)],axis=0),axis=0) # ''' # majority voting, does not work # ''' # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0) # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0) # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0) # binarize_conc = 
T.concatenate([binarize_NN.dimshuffle('x',0,1), binarize_dataless.dimshuffle('x',0,1),binarize_dataless_finegrained.dimshuffle('x',0,1)],axis=0) # sum_binarize_conc = T.sum(binarize_conc,axis=0) # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0) # ''' # sum up prob, works # ''' # ensemble_scores_1 = 0.6*ensemble_NN_scores+0.4*top_k_score_matrix # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0) ''' sum up prob, works ''' ensemble_scores = ensemble_NN_scores #0.6*ensemble_NN_scores+0.4*0.5*(cosine_score_matrix+top_k_score_matrix) binarize_prob = T.where(ensemble_scores > 0.3, 1, 0) ''' test for other fields ''' sum_tensor3 = acnn_other_prob_tensor3 #(batch, 4, 3) #train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore') train_p1_model = theano.function( [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask], cost, updates=updates, allow_input_downcast=True, on_unused_input='ignore') train_p2_model = theano.function([ sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask, other_labels ], cost_other, updates=other_updates, allow_input_downcast=True, on_unused_input='ignore') test_model = theano.function( [sents_id_matrix, sents_mask, des_id_matrix, des_mask], [binarize_prob, ensemble_scores, sum_tensor3], allow_input_downcast=True, on_unused_input='ignore') ############### # TRAIN MODEL # ############### print '... 
training' # early-stopping parameters patience = 50000000000 # look as this many examples regardless start_time = time.time() mid_time = start_time past_time = mid_time epoch = 0 done_looping = False n_train_batches = train_size / batch_size train_batch_start = list( np.arange(n_train_batches) * batch_size) + [train_size - batch_size] n_train_p2_batches = train_p2_size / batch_size train_p2_batch_start = list(np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size] n_test_batches = test_size / batch_size n_test_remain = test_size % batch_size test_batch_start = list( np.arange(n_test_batches) * batch_size) + [test_size - batch_size] train_p2_batch_start_set = set(train_p2_batch_start) # max_acc_dev=0.0 # max_meanf1_test=0.0 # max_weightf1_test=0.0 train_indices = range(train_size) train_p2_indices = range(train_p2_size) cost_i = 0.0 other_cost_i = 0.0 min_mean_frame = 100.0 while epoch < n_epochs: epoch = epoch + 1 random.Random(100).shuffle(train_indices) random.Random(100).shuffle(train_p2_indices) iter_accu = 0 for batch_id in train_batch_start: #for each batch # iter means how many batches have been run, taking into loop iter = (epoch - 1) * n_train_batches + iter_accu + 1 iter_accu += 1 train_id_batch = train_indices[batch_id:batch_id + batch_size] cost_i += train_p1_model(train_sents[train_id_batch], train_masks[train_id_batch], train_labels[train_id_batch], label_sent, label_mask) if batch_id in train_p2_batch_start_set: train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size] other_cost_i += train_p2_model( train_p2_sents[train_p2_id_batch], train_p2_masks[train_p2_id_batch], train_p2_labels[train_p2_id_batch], label_sent, label_mask, train_p2_other_labels[train_p2_id_batch]) # else: # random_batch_id = random.choice(train_p2_batch_start) # train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id+batch_size] # other_cost_i+=train_p2_model( # train_p2_sents[train_p2_id_batch], # train_p2_masks[train_p2_id_batch], # 
train_p2_labels[train_p2_id_batch], # label_sent, # label_mask, # train_p2_other_labels[train_p2_id_batch] # ) #after each 1000 batches, we test the performance of the model on all test data if iter % 20 == 0: print 'Epoch ', epoch, 'iter ' + str( iter) + ' average cost: ' + str(cost_i / iter), str( other_cost_i / iter), 'uses ', (time.time() - past_time) / 60.0, 'min' past_time = time.time() pred_types = [] pred_confs = [] pred_others = [] for i, test_batch_id in enumerate( test_batch_start): # for each test batch pred_types_i, pred_conf_i, pred_fields_i = test_model( test_sents[test_batch_id:test_batch_id + batch_size], test_masks[test_batch_id:test_batch_id + batch_size], label_sent, label_mask) if i < len(test_batch_start) - 1: pred_types.append(pred_types_i) pred_confs.append(pred_conf_i) pred_others.append(pred_fields_i) else: pred_types.append(pred_types_i[-n_test_remain:]) pred_confs.append(pred_conf_i[-n_test_remain:]) pred_others.append(pred_fields_i[-n_test_remain:]) pred_types = np.concatenate(pred_types, axis=0) pred_confs = np.concatenate(pred_confs, axis=0) pred_others = np.concatenate(pred_others, axis=0) mean_frame = generate_2018_official_output( test_lines, output_file_path, pred_types, pred_confs, pred_others, min_mean_frame) if mean_frame < min_mean_frame: min_mean_frame = mean_frame print '\t\t\t test over, min_mean_frame:', min_mean_frame print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min' mid_time = time.time() #print 'Batch_size: ', update_freq end_time = time.time() print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10,
             corruption_levels=[0.1, 0.1]):
    """Assemble a stack of sigmoid layers, each paired with a dA, topped
    by a logistic-regression layer.

    The number of hidden layers is taken from ``hidden_layers_sizes``.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator handed to every
                      HiddenLayer and dA that is built here

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; when falsy, one is
                       created from a seed drawn with
                       ``numpy_rng.randint(2 ** 30)``

    :type n_ins: int
    :param n_ins: dimension of the input to the first layer

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: size of each hidden layer, bottom to top;
                                must contain at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type corruption_levels: list of float
    :param corruption_levels: accepted for interface compatibility; this
                              constructor does not read it
    """
    self.sigmoid_layers, self.dA_layers, self.params = [], [], []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # Symbolic inputs: a matrix of samples and a vector of int labels.
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    # Walk up the stack. `below_output` / `below_width` always describe
    # what feeds the layer being built: the raw input for the first
    # layer, the previous sigmoid layer afterwards.
    below_output = self.x
    below_width = n_ins
    for width in hidden_layers_sizes:
        hidden = HiddenLayer(rng=numpy_rng,
                             input=below_output,
                             n_in=below_width,
                             n_out=width,
                             activation=T.nnet.sigmoid)
        self.sigmoid_layers.append(hidden)
        # Only the sigmoid layer's own parameters are registered on the
        # network; nothing that belongs solely to the dA is added.
        self.params.extend(hidden.params)

        # The autoencoder is given the sigmoid layer's W and b, so it
        # shares those weights with the layer it is paired with.
        self.dA_layers.append(dA(numpy_rng=numpy_rng,
                                 theano_rng=theano_rng,
                                 input=below_output,
                                 n_visible=below_width,
                                 n_hidden=width,
                                 W=hidden.W,
                                 bhid=hidden.b))

        below_output = hidden.output
        below_width = width

    # Logistic-regression layer on top of the last sigmoid layer.
    top_layer = self.sigmoid_layers[-1]
    self.logLayer = LogisticRegression(input=top_layer.output,
                                       n_in=hidden_layers_sizes[-1],
                                       n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # Fine-tuning cost: negative log likelihood of the labels.
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)
    # Symbolic error measure for the minibatch given by self.x / self.y.
    self.errors = self.logLayer.errors(self.y)
def __init__(self, rng, n_in=784, n_hidden=[500, 500], n_out=10,
             lambda_reg=0.001, alpha_reg=0.001):
    """Build a stack of sigmoid layers, each paired with an RBM, topped by
    a logistic-regression layer and an elastic-net regularised cost.

    The number of hidden layers is taken from ``n_hidden``.

    :type rng: numpy.random.RandomState
    :param rng: random number generator passed to every HiddenLayer and,
                as ``numpy_rng``, to every RBM (the RBMs are created with
                ``theano_rng=None``)

    :type n_in: int
    :param n_in: dimension of the input to the network

    :type n_hidden: list of ints
    :param n_hidden: intermediate layer sizes, must contain at least one
                     value; the list is only read, never modified

    :type n_out: int
    :param n_out: dimension of the output of the network

    :type lambda_reg: float
    :param lambda_reg: overall weight of the regularisation term
                       lambda_reg * ((1 - alpha_reg) / 2 * ||W||_2^2
                                     + alpha_reg * ||W||_1)

    :type alpha_reg: float
    :param alpha_reg: mixing factor between the squared L2 norm and the
                      L1 norm in the term above; 0 keeps only the L2
                      part, 1 keeps only the L1 part
    """
    self.hidden_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(n_hidden)

    assert self.n_layers > 0, "n_hidden must contain at least one value"

    # Symbolic inputs: a matrix with one sample per row and a vector of
    # int labels.
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    for i in range(self.n_layers):
        # The first layer is fed the raw input; every later layer is fed
        # the output of the layer below it.
        if i == 0:
            input_size = n_in
            layer_input = self.x
        else:
            input_size = n_hidden[i - 1]
            layer_input = self.hidden_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=n_hidden[i],
                                    activation=T.nnet.sigmoid)
        self.hidden_layers.append(sigmoid_layer)
        # Only the sigmoid layer's parameters are registered on the
        # network; parameters the RBM creates for itself are not.
        self.params.extend(sigmoid_layer.params)

        # The RBM is given the sigmoid layer's W and b, so it shares
        # those weights with the layer it is paired with.
        rbm_layer = RBM(numpy_rng=rng,
                        theano_rng=None,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=n_hidden[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # Logistic-regression layer on top of the last hidden layer. At
    # least one hidden layer always exists because of the assert above.
    self.logRegressionLayer = LogisticRegression(
        input=self.hidden_layers[-1].output,
        n_in=n_hidden[-1],
        n_out=n_out)
    self.params.extend(self.logRegressionLayer.params)

    # Regularisation: L1 norm and squared L2 norm summed over every
    # weight matrix (hidden layers first, output layer last).
    weight_matrices = [layer.W for layer in self.hidden_layers]
    weight_matrices.append(self.logRegressionLayer.W)
    self.L1 = T.sum([abs(W).sum() for W in weight_matrices])
    self.L2_sqr = T.sum([(W ** 2).sum() for W in weight_matrices])

    # Training cost: negative log likelihood plus the elastic-net term.
    self.negative_log_likelihood = \
        self.logRegressionLayer.negative_log_likelihood(self.y)
    self.cost = self.negative_log_likelihood + \
        lambda_reg * ((1.0 - alpha_reg) * 0.5 * self.L2_sqr
                      + alpha_reg * self.L1)

    # Symbolic error measure and predictions for the minibatch given by
    # self.x and self.y.
    self.errors = self.logRegressionLayer.errors(self.y)
    self.y_pred = self.logRegressionLayer.y_pred
    self.y_pred_prob = self.logRegressionLayer.y_pred_prob
def __init__(self, rng, batch_size=100, input_size=None, nkerns=[4, 4, 4],
             receptive_fields=((2, 8), (2, 8), (2, 8)),
             poolsizes=((1, 8), (1, 8), (1, 4)),
             full_hidden=[16], n_out=10):
    """Build a stack of conv-pool layers, then tanh hidden layers, then a
    logistic-regression output layer.

    :param rng: random number generator passed to every layer built here
    :param batch_size: number of samples per batch; baked into the
                       reshape of the input and into every image_shape
    :param input_size: two-element sequence used as the last two
                       dimensions of the first layer's image shape
                       (presumably rows, columns -- confirm with caller);
                       must be supplied, the default None cannot be
                       indexed
    :param nkerns: number of kernels of each conv-pool layer, one entry
                   per layer
    :param receptive_fields: filter size of each conv-pool layer
    :param poolsizes: pool size of each conv-pool layer
    :param full_hidden: sizes of the fully connected tanh layers
    :param n_out: number of outputs of the logistic-regression layer
    """
    # Symbolic inputs: a floatX matrix of samples and a vector of int
    # labels.
    self.x = T.matrix(name='x', dtype=theano.config.floatX)
    self.y = T.ivector('y')
    self.batch_size = theano.shared(value=batch_size, name='batch_size')

    self.layers = []
    self.params = []

    # --- conv-pool layers -------------------------------------------
    # `map_size` tracks the feature-map size entering the current layer
    # and `in_channels` the number of feature maps entering it.
    map_size = None
    in_channels = 1
    for idx, n_filters in enumerate(nkerns):
        field = receptive_fields[idx]
        if idx == 0:
            # First layer: reshape the flat input matrix into a 4D
            # tensor with a single channel.
            map_size = input_size
            conv_input = self.x.reshape(
                (batch_size, 1, map_size[0], map_size[1]))
        else:
            conv_input = self.layers[idx - 1].output

        conv_layer = LeNetConvPoolLayer(
            rng=rng,
            input=conv_input,
            image_shape=(batch_size, in_channels,
                         map_size[0], map_size[1]),
            filter_shape=(n_filters, in_channels, field[0], field[1]),
            poolsize=poolsizes[idx])

        # Feature-map size that the next layer will receive.
        size_after_conv = get_featmap_size_after_conv(map_size, field)
        map_size = get_featmap_size_after_downsample(size_after_conv,
                                                     poolsizes[idx])

        self.layers.append(conv_layer)
        self.params.extend(conv_layer.params)
        in_channels = n_filters

    # --- fully connected layers -------------------------------------
    print('going to fully connected layers')
    for idx, width in enumerate(full_hidden):
        if idx == 0:
            # Flatten the last conv-pool output to 2 dimensions.
            dense_input = self.layers[-1].output.flatten(2)
            fan_in = nkerns[-1] * map_size[0] * map_size[1]
        else:
            dense_input = dense_output
            fan_in = full_hidden[idx - 1]

        dense_layer = HiddenLayer(rng=rng,
                                  input=dense_input,
                                  n_in=fan_in,
                                  n_out=width,
                                  activation=T.tanh)
        self.layers.append(dense_layer)
        self.params.extend(dense_layer.params)
        dense_output = dense_layer.output

    # --- output layer -----------------------------------------------
    print('going to output layer')
    self.logRegressionLayer = LogisticRegression(
        input=self.layers[-1].output,
        n_in=full_hidden[-1],
        n_out=n_out)
    self.params.extend(self.logRegressionLayer.params)

    # The cost minimised during training is the negative log likelihood;
    # it is exposed under both attribute names.
    self.negative_log_likelihood = \
        self.logRegressionLayer.negative_log_likelihood(self.y)
    self.cost = self.logRegressionLayer.negative_log_likelihood(self.y)
    self.errors = self.logRegressionLayer.errors(self.y)
    self.y_pred = self.logRegressionLayer.y_pred
    self.y_pred_prob = self.logRegressionLayer.y_pred_prob
def fit(self, X, Y):
    """Build a one-hidden-layer MLP and train it by gradient descent on
    the whole training set at once.

    The method runs in one of two modes, selected by ``self.best_error``:

    * ``self.best_error is None`` (validation mode): ``X`` and ``Y`` are
      indexed as ``X[0]``/``Y[0]`` for training and ``X[1]``/``Y[1]`` for
      validation. After every epoch the validation error is measured and
      the best (lowest) mean is tracked.
    * otherwise (refit mode): ``X`` and ``Y`` are a single training set.
      Training stops as soon as the training cost is less than or equal
      to ``self.best_error``, or after ``self.n_epochs`` epochs.

    In both modes the labels are shifted down by one before being handed
    to ``shared_dataset`` (presumably they arrive 1-based -- confirm with
    the caller), and the number of classes is the number of distinct
    training labels.

    Reads ``self.n_hidden``, ``self.L1_reg``, ``self.L2_reg``,
    ``self.learning_rate``, ``self.n_epochs`` and ``self.best_error``.
    Sets ``self.x``, ``self.y``, ``self.hiddenLayer``,
    ``self.logRegressionLayer``, ``self.L1``, ``self.L2_sqr``,
    ``self.negative_log_likelihood``, ``self.errors``, ``self.params``,
    ``self.probabilities``, ``self.train_model`` and, in validation mode
    only, ``self.validate_model``.

    :param X: training inputs, or a pair (train inputs, validation
              inputs) in validation mode
    :param Y: training labels, or a pair (train labels, validation
              labels) in validation mode

    :return: the best mean validation error seen, or ``numpy.inf`` in
             refit mode, where no validation is run
    """
    validating = self.best_error is None

    if validating:
        train_set_x, train_set_y = shared_dataset((X[0], Y[0] - 1),
                                                  borrow=True)
        valid_set_x, valid_set_y = shared_dataset((X[1], Y[1] - 1),
                                                  borrow=True)
        n_classes = len(set(Y[0]))
        input_dimension = X[0].shape[1]
    else:
        Y = numpy.array(Y)
        train_set_x, train_set_y = shared_dataset((X, Y - 1), borrow=True)
        n_classes = len(set(Y))
        input_dimension = X.shape[1]

    # One gradient step covers the whole training set, so there is a
    # single "batch" per epoch.
    n_batches = 1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    # Symbolic inputs: a matrix of samples and a vector of int labels.
    self.x = T.matrix('x')
    self.y = T.ivector('y')

    rng = numpy.random.RandomState(1234)

    # A tanh hidden layer feeding a logistic-regression layer.
    self.hiddenLayer = HiddenLayer(rng=rng, input=self.x,
                                   n_in=input_dimension,
                                   n_out=self.n_hidden,
                                   activation=T.tanh)
    self.logRegressionLayer = LogisticRegression(
        input=self.hiddenLayer.output,
        n_in=self.n_hidden,
        n_out=n_classes)

    # L1 norm and squared L2 norm of both weight matrices.
    self.L1 = abs(self.hiddenLayer.W).sum() \
        + abs(self.logRegressionLayer.W).sum()
    self.L2_sqr = (self.hiddenLayer.W ** 2).sum() \
        + (self.logRegressionLayer.W ** 2).sum()

    # Both of these are kept as bound methods of the output layer; they
    # are applied to self.y below.
    self.negative_log_likelihood = \
        self.logRegressionLayer.negative_log_likelihood
    self.errors = self.logRegressionLayer.errors

    self.params = self.hiddenLayer.params + self.logRegressionLayer.params
    self.probabilities = self.logRegressionLayer.p_y_given_x

    # Training cost: negative log likelihood plus L1 and L2 penalties.
    cost = self.negative_log_likelihood(self.y) \
        + self.L1_reg * self.L1 \
        + self.L2_reg * self.L2_sqr

    if validating:
        # Takes no inputs: the whole validation set is substituted for
        # self.x / self.y through `givens`.
        self.validate_model = theano.function(
            inputs=[],
            outputs=(self.errors(self.y), self.probabilities),
            givens={self.x: valid_set_x,
                    self.y: valid_set_y})

    # Plain gradient descent: param <- param - learning_rate * d(cost)/d(param)
    gparams = [T.grad(cost, param) for param in self.params]
    updates = [(param, param - self.learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]

    # Takes no inputs: each call performs one update step on the whole
    # training set and returns the cost.
    self.train_model = theano.function(
        inputs=[],
        outputs=cost,
        updates=updates,
        givens={self.x: train_set_x,
                self.y: train_set_y})

    ###############
    # TRAIN MODEL #
    ###############
    # Early-stopping parameters.
    patience = 10000               # run at least this many iterations
    patience_increase = 2          # extend patience to this multiple of
                                   # the iteration of a significant best
    improvement_threshold = 0.995  # relative improvement that counts as
                                   # significant
    validation_frequency = 1       # validate after every epoch

    best_validation_loss = numpy.inf
    epoch = 0

    while epoch < self.n_epochs:
        epoch += 1
        train_cost = self.train_model()

        if not validating:
            # Refit mode: stop once the training cost has come down to
            # the target; otherwise keep training.
            if train_cost <= self.best_error:
                break
            continue

        iteration = (epoch - 1) * n_batches

        if (iteration + 1) % validation_frequency == 0:
            validation_losses, _probs = self.validate_model()
            this_validation_loss = numpy.mean(validation_losses)

            if this_validation_loss < best_validation_loss:
                # Extend patience only when the improvement is large
                # enough to be considered significant.
                if this_validation_loss < (best_validation_loss
                                           * improvement_threshold):
                    patience = max(patience,
                                   iteration * patience_increase)
                best_validation_loss = this_validation_loss

        if patience <= iteration:
            break

    return best_validation_loss
def evaluate_lenet5(learning_rate=0.02,
                    n_epochs=100,
                    emb_size=300,
                    batch_size=50,
                    filter_size=[3],
                    sent_len=40,
                    claim_len=40,
                    cand_size=10,
                    hidden_size=[300, 300],
                    max_pred_pick=5):
    """Jointly train evidence selection and claim classification on FEVER.

    Two losses are optimised together with a single ``train_model`` call:

    * task 1 scores each of the ``cand_size`` candidate sentences against the
      claim (sigmoid over CNN + attentive-CNN features) and uses ``-log(F1)``
      of those scores against the sentence labels as loss;
    * task 2 sums the CNN embeddings of the labelled sentences, concatenates
      the claim embedding and classifies into three labels with a
      ``LogisticRegression`` layer.

    Every 100 training iterations the dev data is scored with ``fever_score``
    (first on the evidence-bearing claims only, then with the
    NOT ENOUGH INFO claims added) and the scores are printed.

    :param learning_rate: step size handed to ``Gradient_Cost_Para``.
    :param n_epochs: number of passes over the joint training batches.
    :param emb_size: width of the word-embedding table.
    :param batch_size: number of claims per batch; baked into the symbolic
        reshapes, so every batch fed to the model must have exactly this size.
    :param filter_size: sequence whose first item is the convolution width.
    :param sent_len: padded candidate-sentence length passed to the loaders.
    :param claim_len: padded claim length passed to the loaders.
    :param cand_size: number of candidate sentences per claim.
    :param hidden_size: sequence whose first item is the number of
        convolution feature maps.
    :param max_pred_pick: accepted for interface compatibility; the code
        that would have used it is commented out, so it has no effect.
    :return: the highest ``label_accuracy`` reported by ``fever_score`` on the
        full (three-label) dev evaluation, or ``0.0`` if no evaluation ran.
    """
    # Must stay the first statement so that only the arguments are captured.
    model_options = locals().copy()
    print('model options %s' % (model_options,))
    pred_id2label = {1: 'SUPPORTS', 0: 'REFUTES', 2: 'NOT ENOUGH INFO'}

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fixed seed -> repeatable initialisation
    # Not used below, but kept on purpose: rng.randint() advances ``rng``, so
    # dropping this line would change every later random initialisation.
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    # ---- load raw data ----------------------------------------------------
    (train_sents, train_sent_masks, train_sent_labels, train_claims,
     train_claim_mask, train_labels, word2id) = load_fever_train(
         sent_len, claim_len, cand_size)
    (train_3th_sents, train_3th_sent_masks, train_3th_sent_labels,
     train_3th_claims, train_3th_claim_mask, train_3th_labels,
     word2id) = load_fever_train_NoEnoughInfo(
         sent_len, claim_len, cand_size, word2id)
    (test_sents, test_sent_masks, test_sent_labels, test_claims,
     test_claim_mask, test_sent_names, test_ground_names, test_labels,
     word2id) = load_fever_dev(sent_len, claim_len, cand_size, word2id)
    (test_3th_sents, test_3th_sent_masks, test_3th_sent_labels,
     test_3th_claims, test_3th_claim_mask, test_3th_labels,
     word2id) = load_fever_dev_NoEnoughInfo(
         sent_len, claim_len, cand_size, word2id)

    float_x = theano.config.floatX

    # Training data: the evidence-bearing set feeds task 1, the concatenation
    # with the NOT ENOUGH INFO set feeds task 2.
    train_sents, _, joint_train_sents = _as_arrays_and_join(
        train_sents, train_3th_sents, 'int32')
    train_sent_masks, _, joint_train_sent_masks = _as_arrays_and_join(
        train_sent_masks, train_3th_sent_masks, float_x)
    train_sent_labels, _, joint_train_sent_labels = _as_arrays_and_join(
        train_sent_labels, train_3th_sent_labels, 'int32')
    train_claims, _, joint_train_claims = _as_arrays_and_join(
        train_claims, train_3th_claims, 'int32')
    train_claim_mask, _, joint_train_claim_mask = _as_arrays_and_join(
        train_claim_mask, train_3th_claim_mask, float_x)
    train_labels, _, joint_train_labels = _as_arrays_and_join(
        train_labels, train_3th_labels, 'int32')

    # Dev data: the two sets are evaluated separately, so no concatenation.
    test_sents = np.asarray(test_sents, dtype='int32')
    test_3th_sents = np.asarray(test_3th_sents, dtype='int32')
    test_sent_masks = np.asarray(test_sent_masks, dtype=float_x)
    test_3th_sent_masks = np.asarray(test_3th_sent_masks, dtype=float_x)
    test_sent_labels = np.asarray(test_sent_labels, dtype='int32')
    test_3th_sent_labels = np.asarray(test_3th_sent_labels, dtype='int32')
    test_claims = np.asarray(test_claims, dtype='int32')
    test_3th_claims = np.asarray(test_3th_claims, dtype='int32')
    test_claim_mask = np.asarray(test_claim_mask, dtype=float_x)
    test_3th_claim_mask = np.asarray(test_3th_claim_mask, dtype=float_x)
    test_labels = np.asarray(test_labels, dtype='int32')
    test_3th_labels = np.asarray(test_3th_labels, dtype='int32')

    joint_train_size = len(joint_train_claims)
    train_size = len(train_claims)
    test_size = len(test_claims)
    test_3th_size = len(test_3th_claims)
    joint_test_size = test_size + test_3th_size
    vocab_size = len(word2id) + 1

    print('joint_train size:  %s  joint_test size:  %s' %
          (joint_train_size, joint_test_size))
    print('train size:  %s  test size:  %s' % (train_size, test_size))
    print('vocab size:  %s' % (vocab_size,))

    # Gaussian-initialised embedding table, overwritten with word2vec vectors
    # by load_word2vec_to_init.
    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))
    id2word = {idx: word for word, idx in word2id.items()}
    word2vec = load_word2vec()
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    init_embeddings = theano.shared(
        value=np.array(rand_values, dtype=float_x), borrow=True)

    # ---- symbolic inputs --------------------------------------------------
    sents_ids = T.itensor3()  # (batch, cand_size, sent_len)
    sents_mask = T.ftensor3()
    sents_labels = T.imatrix()  # (batch, cand_size)
    claim_ids = T.imatrix()  # (batch, claim_len)
    claim_mask = T.fmatrix()

    joint_sents_ids = T.itensor3()  # (batch, cand_size, sent_len)
    joint_sents_mask = T.ftensor3()
    joint_sents_labels = T.imatrix()  # (batch, cand_size)
    joint_claim_ids = T.imatrix()  # (batch, claim_len)
    joint_claim_mask = T.fmatrix()
    joint_labels = T.ivector()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    conv_filter_shape = (hidden_size[0], 1, emb_size, filter_size[0])
    context_filter_shape = (hidden_size[0], 1, emb_size, 1)
    sent_image_shape = (batch_size * cand_size, 1, emb_size, sent_len)
    claim_image_shape = (batch_size, 1, emb_size, claim_len)

    # Embedding lookups, laid out as (rows, emb_size, length).
    embed_input_sents = init_embeddings[sents_ids.flatten()].reshape(
        (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    embed_input_claim = init_embeddings[claim_ids.flatten()].reshape(
        (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    conv_W, conv_b = create_conv_para(rng, filter_shape=conv_filter_shape)
    att_conv_W, att_conv_b = create_conv_para(
        rng, filter_shape=conv_filter_shape)
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=context_filter_shape)
    # conv_b_context is deliberately left as in the original: it is not part
    # of the trained parameter list.
    NN_para = [conv_W, conv_b, att_conv_W, att_conv_b, conv_W_context]

    flat_sents_mask = sents_mask.reshape(
        (sents_mask.shape[0] * sents_mask.shape[1], sents_mask.shape[2]))

    conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_sents,
        mask_matrix=flat_sents_mask,
        image_shape=sent_image_shape,
        filter_shape=conv_filter_shape,
        W=conv_W,
        b=conv_b)
    sent_embeddings = conv_model_sents.maxpool_vec  # (batch*cand, hidden)
    batch_sent_emb = sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))

    conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=embed_input_claim,
        mask_matrix=claim_mask,
        image_shape=claim_image_shape,
        filter_shape=conv_filter_shape,
        W=conv_W,
        b=conv_b)
    claim_embeddings = conv_model_claims.maxpool_vec  # (batch, hidden)
    # Repeat the claim vector once per candidate sentence.
    batch_claim_emb = T.repeat(
        claim_embeddings.dimshuffle(0, 'x', 1), cand_size, axis=1)

    # Attentive convolution between every candidate sentence and its claim.
    attentive_conv_layer = Attentive_Conv_for_Pair_easy_version(
        rng,
        input_tensor3=embed_input_sents,
        input_tensor3_r=T.repeat(embed_input_claim, cand_size, axis=0),
        mask_matrix=flat_sents_mask,
        mask_matrix_r=T.repeat(claim_mask, cand_size, axis=0),
        image_shape=sent_image_shape,
        image_shape_r=(batch_size * cand_size, 1, emb_size, claim_len),
        filter_shape=conv_filter_shape,
        filter_shape_context=context_filter_shape,
        W=att_conv_W,
        b=att_conv_b,
        W_context=conv_W_context,
        b_context=conv_b_context)
    attentive_sent_embeddings_l = attentive_conv_layer.attentive_maxpool_vec_l
    attentive_sent_embeddings_r = attentive_conv_layer.attentive_maxpool_vec_r

    # Features per (claim, sentence) pair: both embeddings, their dot
    # product, and the two attentive embeddings.
    concate_claim_sent = T.concatenate([
        batch_claim_emb,
        batch_sent_emb,
        T.sum(batch_claim_emb * batch_sent_emb, axis=2).dimshuffle(0, 1, 'x'),
    ], axis=2)
    concate_2_matrix = concate_claim_sent.reshape(
        (batch_size * cand_size, hidden_size[0] * 2 + 1))
    LR_input = T.concatenate([
        concate_2_matrix,
        attentive_sent_embeddings_l,
        attentive_sent_embeddings_r,
    ], axis=1)
    LR_input_size = hidden_size[0] * 2 + 1 + hidden_size[0] * 2

    U_a = create_ensemble_para(rng, 1, LR_input_size)
    LR_para = [U_a]

    score_matrix = T.nnet.sigmoid(LR_input.dot(U_a))
    inter_matrix = score_matrix.reshape((batch_size, cand_size))

    # Soft F1 between the sentence scores and the gold sentence labels.
    # NOTE(review): a row of sents_labels with no positive entry makes
    # batch_recall 0/0 here; presumably the evidence-bearing training set
    # never contains such a row - confirm against the loader.
    batch_overlap = T.sum(sents_labels * inter_matrix, axis=1)
    batch_recall = batch_overlap / T.sum(sents_labels, axis=1)
    batch_precision = batch_overlap / T.sum(inter_matrix, axis=1)
    batch_f1 = 2.0 * batch_recall * batch_precision / (
        batch_recall + batch_precision)
    loss = -T.mean(T.log(batch_f1))

    # ---- task 2: predict one of three claim labels ------------------------
    joint_embed_input_sents = init_embeddings[
        joint_sents_ids.flatten()].reshape(
            (batch_size * cand_size, sent_len, emb_size)).dimshuffle(0, 2, 1)
    joint_embed_input_claim = init_embeddings[
        joint_claim_ids.flatten()].reshape(
            (batch_size, claim_len, emb_size)).dimshuffle(0, 2, 1)

    joint_conv_model_sents = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_sents,
        mask_matrix=joint_sents_mask.reshape(
            (joint_sents_mask.shape[0] * joint_sents_mask.shape[1],
             joint_sents_mask.shape[2])),
        image_shape=sent_image_shape,
        filter_shape=conv_filter_shape,
        W=conv_W,
        b=conv_b)
    joint_sent_embeddings = joint_conv_model_sents.maxpool_vec
    joint_batch_sent_emb = joint_sent_embeddings.reshape(
        (batch_size, cand_size, hidden_size[0]))
    # Premise = sum of the sentence embeddings selected by the gold labels.
    joint_premise_emb = T.sum(
        joint_batch_sent_emb * joint_sents_labels.dimshuffle(0, 1, 'x'),
        axis=1)  # (batch, hidden_size)

    joint_conv_model_claims = Conv_with_Mask(
        rng,
        input_tensor3=joint_embed_input_claim,
        mask_matrix=joint_claim_mask,
        image_shape=claim_image_shape,
        filter_shape=conv_filter_shape,
        W=conv_W,
        b=conv_b)
    joint_claim_embeddings = joint_conv_model_claims.maxpool_vec

    joint_premise_hypo_emb = T.concatenate(
        [joint_premise_emb, joint_claim_embeddings],
        axis=1)  # (batch, 2*hidden_size)

    # Logistic-regression layer over the premise/claim pair.
    joint_LR_input = joint_premise_hypo_emb
    joint_LR_input_size = 2 * hidden_size[0]
    joint_U_a = create_ensemble_para(rng, 3, joint_LR_input_size)
    joint_LR_b = theano.shared(
        value=np.zeros((3, ), dtype=float_x), name='LR_b', borrow=True)
    joint_LR_para = [joint_U_a, joint_LR_b]

    joint_layer_LR = LogisticRegression(
        rng,
        input=joint_LR_input,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b)
    joint_loss = joint_layer_LR.negative_log_likelihood(joint_labels)

    # ---- testing graph ----------------------------------------------------
    # At test time the premise is weighted by the predicted sentence scores,
    # masked by the sentence labels passed in.
    masked_inter_matrix = inter_matrix * sents_labels  # (batch, cand_size)
    test_premise_emb = T.sum(
        batch_sent_emb * masked_inter_matrix.dimshuffle(0, 1, 'x'), axis=1)
    test_premise_hypo_emb = T.concatenate(
        [test_premise_emb, claim_embeddings], axis=1)
    test_layer_LR = LogisticRegression(
        rng,
        input=test_premise_hypo_emb,
        n_in=joint_LR_input_size,
        n_out=3,
        W=joint_U_a,
        b=joint_LR_b)

    params = [init_embeddings] + NN_para + LR_para + joint_LR_para
    cost = loss + joint_loss
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    train_model = theano.function(
        [
            sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
            joint_sents_ids, joint_sents_mask, joint_sents_labels,
            joint_claim_ids, joint_claim_mask, joint_labels
        ],
        cost,
        updates=updates,
        allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [
            sents_ids, sents_mask, sents_labels, claim_ids, claim_mask,
            joint_labels
        ],
        [
            inter_matrix,
            test_layer_LR.errors(joint_labels),
            test_layer_LR.y_pred
        ],
        allow_input_downcast=True,
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')
    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0

    # Floor division keeps the batch counts integral on Python 2 and 3.
    joint_n_train_batches = joint_train_size // batch_size
    joint_train_batch_start = _batch_starts(joint_train_size, batch_size)
    train_batch_start = _batch_starts(train_size, batch_size)
    test_batch_start = _batch_starts(test_size, batch_size)
    test_3th_batch_start = _batch_starts(test_3th_size, batch_size)

    # Best label accuracy seen so far; this is the function's return value.
    # (The original returned this name without ever assigning it.)
    max_acc_test = 0.0
    cost_i = 0.0
    # Real lists, so that shuffle() also works on Python 3.
    joint_train_indices = list(range(joint_train_size))
    train_indices = list(range(train_size))

    while epoch < n_epochs:
        epoch += 1
        # Re-shuffle the (already shuffled) index lists at every epoch.
        random.Random(100).shuffle(joint_train_indices)
        random.Random(100).shuffle(train_indices)

        for batch_no, joint_batch_id in enumerate(joint_train_batch_start, 1):
            # Number of joint batches processed so far, across epochs.
            iteration = (epoch - 1) * joint_n_train_batches + batch_no
            joint_train_id_batch = joint_train_indices[
                joint_batch_id:joint_batch_id + batch_size]

            # NOTE(review): nesting reconstructed from a whitespace-mangled
            # source - each joint batch is paired with three randomly chosen
            # evidence batches; confirm against the upstream file.
            for _ in range(3):
                batch_id = random.choice(train_batch_start)
                train_id_batch = train_indices[batch_id:batch_id + batch_size]
                cost_i += train_model(
                    train_sents[train_id_batch],
                    train_sent_masks[train_id_batch],
                    train_sent_labels[train_id_batch],
                    train_claims[train_id_batch],
                    train_claim_mask[train_id_batch],
                    joint_train_sents[joint_train_id_batch],
                    joint_train_sent_masks[joint_train_id_batch],
                    joint_train_sent_labels[joint_train_id_batch],
                    joint_train_claims[joint_train_id_batch],
                    joint_train_claim_mask[joint_train_id_batch],
                    joint_train_labels[joint_train_id_batch])

            if iteration % 100 != 0:
                continue

            # ---- periodic evaluation on the dev data ----------------------
            print('Epoch  %s iter %s average cost: %s uses  %s min' %
                  (epoch, iteration, cost_i / iteration,
                   (time.time() - past_time) / 60.0))
            past_time = time.time()

            predictions = []
            # Because the last start is (size - batch_size), examples in the
            # overlap with the preceding batch are scored twice.
            for test_batch_id in test_batch_start:
                batch_end = test_batch_id + batch_size
                batch_prob, _, pred_i = test_model(
                    test_sents[test_batch_id:batch_end],
                    test_sent_masks[test_batch_id:batch_end],
                    test_sent_labels[test_batch_id:batch_end],
                    test_claims[test_batch_id:batch_end],
                    test_claim_mask[test_batch_id:batch_end],
                    test_labels[test_batch_id:batch_end])
                batch_sent_labels = test_sent_labels[test_batch_id:batch_end]
                batch_sent_names = test_sent_names[test_batch_id:batch_end]
                batch_ground_names = test_ground_names[test_batch_id:batch_end]
                batch_ground_labels = test_labels[test_batch_id:batch_end]

                for row in range(batch_size):
                    # Rank candidates by predicted score, best first.
                    ranked = sorted(
                        [(batch_prob[row, k], batch_sent_labels[row][k],
                          batch_sent_names[row][k])
                         for k in range(cand_size)],
                        key=lambda item: item[0],
                        reverse=True)
                    # Keep labelled sentences scored above 0.5; the
                    # threshold trades recall against strict accuracy.
                    pred_sent_names = [
                        name for prob, label, name in ranked
                        if label == 1.0 and prob > 0.5
                    ]
                    gold_names = [[None, None] + gold_name
                                  for gold_name in batch_ground_names[row]]
                    predictions.append({
                        'label': pred_id2label.get(batch_ground_labels[row]),
                        'predicted_label': pred_id2label.get(pred_i[row]),
                        'predicted_evidence': pred_sent_names,
                        'evidence': [gold_names],
                    })

            strict_score, label_accuracy, precision, recall, f1 = fever_score(
                predictions)
            print('strict_score, label_accuracy, precision, recall, f1:  '
                  '%s %s %s %s %s' %
                  (strict_score, label_accuracy, precision, recall, f1))

            # Add the NOT ENOUGH INFO claims and score everything together.
            for test_batch_id in test_3th_batch_start:
                batch_end = test_batch_id + batch_size
                _, _, pred_i = test_model(
                    test_3th_sents[test_batch_id:batch_end],
                    test_3th_sent_masks[test_batch_id:batch_end],
                    test_3th_sent_labels[test_batch_id:batch_end],
                    test_3th_claims[test_batch_id:batch_end],
                    test_3th_claim_mask[test_batch_id:batch_end],
                    test_3th_labels[test_batch_id:batch_end])
                for row in range(batch_size):
                    predictions.append({
                        'label': pred_id2label.get(2),
                        'predicted_label': pred_id2label.get(pred_i[row]),
                        'predicted_evidence': [],
                        'evidence': [],
                    })

            strict_score, label_accuracy, precision, recall, f1 = fever_score(
                predictions)
            print('strict_score, label_accuracy, precision, recall, f1:  '
                  '%s %s %s %s %s' %
                  (strict_score, label_accuracy, precision, recall, f1))
            max_acc_test = max(max_acc_test, label_accuracy)

        print('Epoch  %s uses  %s min' %
              (epoch, (time.time() - mid_time) / 60.0))
        mid_time = time.time()

    end_time = time.time()
    sys.stderr.write('The code for file ' + os.path.split(__file__)[1] +
                     ' ran for %.2fm' % ((end_time - start_time) / 60.) +
                     '\n')
    return max_acc_test


def _as_arrays_and_join(first, second, dtype):
    """Convert both inputs to ``dtype`` arrays and also return their concatenation."""
    first = np.asarray(first, dtype=dtype)
    second = np.asarray(second, dtype=dtype)
    return first, second, np.concatenate((first, second))


def _batch_starts(size, batch_size):
    """Return the start offsets of all full batches plus one final batch ending at ``size``."""
    n_full = size // batch_size
    return list(np.arange(n_full) * batch_size) + [size - batch_size]
def __init__(self, rng, input, n_in, n_hidden, n_out):
    """Build a one-hidden-layer perceptron on top of ``input``.

    The network is a ``HiddenLayer`` with a tanh activation whose output
    feeds a ``LogisticRegression`` layer.

    :type rng: numpy.random.RandomState
    :param rng: random number generator handed to the hidden layer

    :type input: theano.tensor.TensorType
    :param input: symbolic variable describing one minibatch of inputs

    :type n_in: int
    :param n_in: number of input units of the hidden layer

    :type n_hidden: int
    :param n_hidden: number of hidden units

    :type n_out: int
    :param n_out: number of output units of the logistic regression layer
    """
    # tanh hidden layer; another nonlinearity could be substituted here.
    hidden = HiddenLayer(rng=rng,
                         input=input,
                         n_in=n_in,
                         n_out=n_hidden,
                         activation=T.tanh)
    self.hiddenLayer = hidden

    # The classifier consumes the hidden units.
    classifier = LogisticRegression(input=hidden.output,
                                    n_in=n_hidden,
                                    n_out=n_out)
    self.logRegressionLayer = classifier

    # Regularisation terms over the two weight matrices:
    # L1 norm and squared L2 norm.
    self.L1 = abs(hidden.W).sum() + abs(classifier.W).sum()
    self.L2_sqr = (hidden.W ** 2).sum() + (classifier.W ** 2).sum()

    # Loss and error count are delegated to the logistic regression layer.
    self.negative_log_likelihood = classifier.negative_log_likelihood
    self.errors = classifier.errors

    # Model parameters: hidden-layer parameters followed by the classifier's.
    self.params = hidden.params + classifier.params

    # Keep a handle on the model input.
    self.input = input
def __init__(self,
             rng,
             model_input,
             image_shape=(3, 32, 32),
             filter_shape=(5, 5),
             poolsize=(2, 2),
             batch_size=100,
             nkerns=(20, 50),
             n_in=400,
             n_out=10,
             temperature=1,
             dropout_ps=[0.0, 0.0, 0.0, 0.0]):
    """Build two parallel LeNet-style stacks: one with dropout, one without.

    Each stage (two conv/pool layers, one tanh hidden layer, one logistic
    regression layer) is created twice. The ``*_dropout`` layer owns the
    parameters; its twin reuses the same bias and the same weights
    multiplied by ``1 - p``, where ``p`` is the dropout probability applied
    to that stage's input in the dropout stack.

    :param rng: random number generator passed to every layer constructor.
    :param model_input: symbolic input, reshaped to
        ``(batch_size,) + image_shape``.
    :param image_shape: per-example input shape; its first item is used as
        the input-channel count of the first convolution.
    :param filter_shape: spatial filter shape of the first convolution.
    :param poolsize: pooling size passed to both conv/pool stages.
    :param batch_size: number of examples per minibatch.
    :param nkerns: pair giving the leading filter dimension of the first
        and second convolution.
    :param n_in: output width of the hidden layer, which is also the input
        width of the logistic regression layer.
    :param n_out: number of outputs of the logistic regression layer.
    :param temperature: forwarded to ``LogisticRegression``.
    :param dropout_ps: four dropout probabilities, applied in order to the
        input and to the outputs of the three dropout layers.
    """
    # ---- stage 0: first conv/pool layer ----------------------------------
    batched_shape = (batch_size, ) + image_shape
    layer0_input = model_input.reshape(batched_shape)
    layer0_input_dropout = _dropout_from_layer(rng,
                                               layer0_input,
                                               p=dropout_ps[0])
    conv0_filters = (nkerns[0], image_shape[0]) + filter_shape

    conv0_drop = DropoutLenetConvPoolLayer(rng,
                                           input=layer0_input_dropout,
                                           image_shape=batched_shape,
                                           filter_shape=conv0_filters,
                                           poolsize=poolsize,
                                           dropout_p=dropout_ps[1])
    self.layer0_dropout = conv0_drop

    conv0 = LeNetConvPoolLayer(
        rng,
        input=layer0_input,
        W=conv0_drop.W * (1 - dropout_ps[0]),
        b=conv0_drop.b,
        image_shape=batched_shape,
        filter_shape=conv0_filters,
        poolsize=poolsize,
    )
    self.layer0 = conv0

    # ---- stage 1: second conv/pool layer ---------------------------------
    # NOTE(review): the 14x14 feature maps and 5x5 filters are hard-coded;
    # they look like they match the default 32x32 input with 5x5 filters and
    # 2x2 pooling - confirm before using other shapes.
    conv1_image = (batch_size, nkerns[0], 14, 14)
    conv1_filters = (nkerns[1], nkerns[0], 5, 5)

    conv1_drop = DropoutLenetConvPoolLayer(
        rng,
        input=conv0_drop.output,
        image_shape=conv1_image,
        filter_shape=conv1_filters,
        poolsize=poolsize,
        dropout_p=dropout_ps[2],
    )
    self.layer1_dropout = conv1_drop

    conv1 = LeNetConvPoolLayer(
        rng,
        input=conv0.output,
        W=conv1_drop.W * (1 - dropout_ps[1]),
        b=conv1_drop.b,
        image_shape=conv1_image,
        filter_shape=conv1_filters,
        poolsize=poolsize,
    )
    self.layer1 = conv1

    # ---- stage 2: fully connected tanh layer -----------------------------
    flat_width = nkerns[1] * 5 * 5

    hidden_drop = DropoutHiddenLayer(
        rng,
        input=conv1_drop.output.flatten(2),
        n_in=flat_width,
        n_out=n_in,
        activation=T.tanh,
        dropout_p=dropout_ps[3],
    )
    self.layer2_dropout = hidden_drop

    hidden = HiddenLayer(
        rng,
        input=conv1.output.flatten(2),
        W=hidden_drop.W * (1 - dropout_ps[2]),
        b=hidden_drop.b,
        n_in=flat_width,
        n_out=n_in,
        activation=T.tanh,
    )
    self.layer2 = hidden

    # ---- stage 3: logistic regression ------------------------------------
    logreg_drop = LogisticRegression(input=hidden_drop.output,
                                     n_in=n_in,
                                     n_out=n_out,
                                     temperature=temperature)
    self.logRegressionLayer_dropout = logreg_drop

    logreg = LogisticRegression(input=hidden.output,
                                W=logreg_drop.W * (1 - dropout_ps[3]),
                                b=logreg_drop.b,
                                n_in=n_in,
                                n_out=n_out,
                                temperature=temperature)
    self.logRegressionLayer = logreg

    # Losses and error counts for both stacks.
    self.negative_log_likelihood = logreg.negative_log_likelihood
    self.negative_log_likelihood_dropout = logreg_drop.negative_log_likelihood
    self.errors = logreg.errors
    self.errors_dropout = logreg_drop.errors

    # Only the dropout stack contributes parameters, listed top-down.
    self.params = (logreg_drop.params + hidden_drop.params +
                   conv1_drop.params + conv0_drop.params)

    self.p_y_given_x = logreg.p_y_given_x
    self.p_y_given_x_relaxed = logreg.p_y_given_x_relaxed