def evaluate_mnist_1(learning_rate=0.1, n_epochs=100, nkerns=[4, 6],
                     batch_size=2):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(3)

    xs = []
    ys = []
    # f = open('temp_value', 'r+')
    # f = open('out_10', 'r+')
    f = open('out_10_10', 'r+')
    while True:
        line = f.readline()
        line2 = f.readline()
        if not line:
            break
        line = line.replace("\n", "")
        values = [float(i) for i in line.split()]
        value = float(line2)
        xs.append(values)
        ys.append(value)
    print(len(xs))
    print(len(xs[0]))
    print(len(ys))
    # print(ys)
    # print(xs)

    test_set_x, test_set_y = shared_dataset([xs, ys])
    valid_set_x, valid_set_y = shared_dataset([xs, ys])
    train_set_x, train_set_y = shared_dataset([xs, ys])
    # train_set_x, train_set_y = datasets[0]
    # valid_set_x, valid_set_y = datasets[1]
    # test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    batch_size = len(ys)
    # batch_size=1
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size
    # n_train_batches = 1
    # n_valid_batches = 1
    # n_test_batches = 1

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # the data is presented as rasterized images
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels
    ishape = (28, 28)    # this is the size of MNIST images

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4)
    layer2_input = layer1.output.flatten(2)
    # myprint=theano.function([x],x)
    # myprint([layer2_input])

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4,
                         n_out=20, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=20, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    prob = layer3.prob_y_given_x(y)

    # dump the initial weights of all layers to a file
    f1 = open('weights', 'w+')
    print "layer 0 weights"
    for w in layer0.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')
    # print layer0.W.get_value()
    # print layer0.b.get_value()
    print "layer 1 weights"
    # print layer1.W.get_value()
    # print layer1.b.get_value()
    for w in layer1.W.get_value():
        for r in w:
            for s in r:
                for d in s:
                    f1.write(str(d) + '\n')
    print "layer 2 weights"
    # print layer2.W.get_value()
    w = layer2.W.get_value()
    # for d in w:
    #     print d
    for i in range(len(w[0])):
        for j in range(len(w)):
            f1.write(str(w[j][i]) + '\n')
    # print layer2.b.get_value()

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    validate_model = theano.function(
        [index],
        layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })
    prob_model = theano.function(
        [index],
        prob,
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })
    conv_model0 = theano.function(
        [index],
        layer0.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model0_conv = theano.function(
        [index],
        layer0.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model1 = theano.function(
        [index],
        layer1.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model1_conv = theano.function(
        [index],
        layer1.conv_out,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})
    conv_model2 = theano.function(
        [index],
        layer2.output,
        givens={x: valid_set_x[index * batch_size:(index + 1) * batch_size]})

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params
    # params = layer0.params + layer1.params + layer2.params + layer3.params
    # x_printed = theano.printing.Print('this is a very important value')(x)
    # f_with_print = theano.function([x], x_printed)
    # f_with_print(layer3.params)

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)
    val_grads = T.grad(cost, layer3.p_y_given_x)
    # print "AAAA"
    # theano.printing.debugprint(temp_grads)
    # print "AAAA"

    grad_model = theano.function(
        [index],
        grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    val_grad_model = theano.function(
        [index],
        val_grads,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        updates.append((param_i, param_i - learning_rate * grad_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False
    bestConvW = layer0.W.get_value()

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            val_grads_ij = val_grad_model(minibatch_index)
            grads_ij = grad_model(minibatch_index)
            conv0_ij = conv_model0(minibatch_index)
            conv1_ij = conv_model1(minibatch_index)
            conv2_ij = conv_model2(minibatch_index)
            conv0_conv_ij = conv_model0_conv(minibatch_index)
            conv1_conv_ij = conv_model1_conv(minibatch_index)
            print 'training @ iter = ', iter
            print "last layer var grads"
            print val_grads_ij[0]
            # print "Layer 0 convolution"
            # for c in conv0_conv_ij[0]:
            #     print c
            # print ""
            # print ""
            # print "Layer 1 convolution"
            # for c in conv1_conv_ij[0]:
            #     print c
            # print ""
            # print ""
            probs = prob_model(minibatch_index)
            print "Probs"
            print probs
            # print "layer 0 grads"
            # print grads_ij[6]
            # print grads_ij[7]
            # print "layer 1 grads"
            # print grads_ij[4]
            # print grads_ij[5]
            # print "layer 2 grads"
            # print grads_ij[2]
            # print grads_ij[3]
            print "log reg layer grads"
            print grads_ij[0]
            print grads_ij[1]
            print "Layer 0 output"
            # for c in conv0_ij:
            #     for d in c:
            #         print d
            # print conv0_ij[0][0]
            print "Layer 1 output"
            # print conv1_ij[0][0]
            # for c in conv1_ij:
            #     for d in c:
            #         print d
            print "Layer 2 output"
            # for c in conv2_ij:
            #     print c
            cost_ij = train_model(minibatch_index)
            # for c in conv0_conv_ij[1]:
            #     print c
            # print ""
            print "learning_rate"
            print learning_rate
            print "layer 0 weights"
            # print layer0.W.get_value()
            # print layer0.b.get_value()
            print "layer 1 weights"
            # print layer1.W.get_value()
            # print layer1.b.get_value()
            print "layer 2 weights"
            w = layer2.W.get_value()
            # print w[0]
            # print w[1]
            # for c in layer2.W.get_value():
            #     print c
            # print layer2.b.get_value()
            print "log reg layer weights"
            print layer3.W.get_value()
            print layer3.b.get_value()
            print "COST"
            print cost_ij

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    bestConvW = layer0.W.get_value()
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
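# A minimal standalone sketch (not part of the original code) of the shape
# arithmetic quoted in the comments of evaluate_mnist_1 above: a 'valid'
# convolution with an (fh, fw) filter followed by non-overlapping (p, p)
# max-pooling. The helper name `conv_pool_output_shape` is illustrative.
def conv_pool_output_shape(h, w, fh=5, fw=5, p=2):
    h, w = h - fh + 1, w - fw + 1  # 'valid' convolution shrinks each side
    return h // p, w // p          # pooling divides each side by p

# For the 28x28 inputs above: 28 -> 24 -> 12 after layer0,
# then 12 -> 8 -> 4 after layer1, matching n_in = nkerns[1] * 4 * 4.
assert conv_pool_output_shape(28, 28) == (12, 12)
assert conv_pool_output_shape(12, 12) == (4, 4)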
def __init__(self, D, M, Q, Domain_number, Hiddenlayerdim1, Hiddenlayerdim2):
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    N = self.X.shape[0]
    self.Weight = T.matrix('Weight')

    ker = kernel(Q)
    #mmd=MMD(M,Domain_number)

    mu_value = np.random.randn(M, D) * 1e-2
    Sigma_b_value = np.zeros((M, M))  # + np.log(0.01)
    Z_value = np.random.randn(M, Q)
    ls_value = np.zeros(Domain_number) + np.log(0.1)

    self.mu = theano.shared(value=mu_value, name='mu', borrow=True)
    self.Sigma_b = theano.shared(value=Sigma_b_value, name='Sigma_b',
                                 borrow=True)
    self.Z = theano.shared(value=Z_value, name='Z', borrow=True)
    self.ls = theano.shared(value=ls_value, name='ls', borrow=True)

    self.params = [self.mu, self.Sigma_b, self.Z, self.ls]

    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D,
                                     n_out=Hiddenlayerdim1,
                                     activation=T.nnet.relu, number='_x')
    #self.hiddenLayer_hidden = HiddenLayer(rng=rng,input=self.hiddenLayer_x.output,n_in=Hiddenlayerdim1,n_out=Hiddenlayerdim2,activation=T.nnet.relu,number='_h')
    self.hiddenLayer_m = HiddenLayer(rng=rng,
                                     input=self.hiddenLayer_x.output,
                                     n_in=Hiddenlayerdim1, n_out=Q,
                                     activation=T.nnet.relu, number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng,
                                     input=self.hiddenLayer_x.output,
                                     n_in=Hiddenlayerdim1, n_out=Q,
                                     activation=T.nnet.relu, number='_S')

    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    #self.loc_params.extend(self.hiddenLayer_hidden.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)

    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i

    self.params.extend(ker.params)
    #self.params.extend(mmd.params)

    self.hyp_params = {}
    for i in [self.mu, self.Sigma_b, self.ls]:
        self.hyp_params[str(i)] = i

    self.Z_params = {}
    for i in [self.Z]:
        self.Z_params[str(i)] = i

    self.global_params = {}
    for i in self.params:
        self.global_params[str(i)] = i

    self.params.extend(self.hiddenLayer_x.params)
    #self.params.extend(self.hiddenLayer_hidden.params)
    self.params.extend(self.hiddenLayer_m.params)
    self.params.extend(self.hiddenLayer_S.params)

    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i

    m = self.hiddenLayer_m.output
    S_0 = self.hiddenLayer_S.output
    S_1 = T.exp(S_0)
    S = T.sqrt(S_1)

    from theano.tensor.shared_randomstreams import RandomStreams
    srng = RandomStreams(seed=234)
    eps_NQ = srng.normal((N, Q))
    eps_M = srng.normal((M, D))  # the mean and the variance need different
                                 # random draws, so they are named separately
    eps_ND = srng.normal((N, D))

    beta = T.exp(self.ls)
    # Sigma for u is not diagonal, so we need a triangular matrix,
    # e.g. via a Cholesky-style factorization
    Sigma = T.tril(self.Sigma_b - T.diag(T.diag(self.Sigma_b)) +
                   T.diag(T.exp(T.diag(self.Sigma_b))))

    # scale transformation
    mu_scaled, Sigma_scaled = ker.sf2**0.5 * self.mu, ker.sf2**0.5 * Sigma

    Xtilda = m + S * eps_NQ
    self.U = mu_scaled + Sigma_scaled.dot(eps_M)

    Kmm = ker.RBF(self.Z)
    #Kmm=mmd.MMD_kenel_Xonly(mmd.Zlabel_T,Kmm,self.Weight)
    KmmInv = sT.matrix_inverse(Kmm)

    Kmn = ker.RBF(self.Z, Xtilda)
    #Kmn=mmd.MMD_kenel_ZX(self.Xlabel,Kmn,self.Weight)

    Knn = ker.RBF(Xtilda)
    #Knn=mmd.MMD_kenel_Xonly(self.Xlabel,Knn,self.Weight)

    Ktilda = Knn - T.dot(Kmn.T, T.dot(KmmInv, Kmn))
    #F = T.dot(Kmn.T,T.dot(KmmInv,self.U)) + T.dot(T.maximum(Ktilda, 1e-16)**0.5,eps_ND)

    Kinterval = T.dot(KmmInv, Kmn)
    A = Kinterval.T
    Sigma_tilda = Ktilda + T.dot(A, T.dot(Sigma_scaled, A.T))
    mean_tilda = T.dot(A, mu_scaled)
    #mean_U=F
    #mean_U=T.dot(Kinterval.T,self.U)
    mean_U = mean_tilda + T.dot(T.maximum(Sigma_tilda, 1e-16)**0.5, eps_ND)

    betaI = T.diag(T.dot(self.Xlabel, beta))
    Covariance = betaI

    self.LL = self.log_mvn(self.X, mean_U, Covariance) / N  # - 0.5*T.sum(T.dot(betaI,Ktilda))
    self.KL_X = -self.KLD_X(m, S)
    self.KL_U = -self.KLD_U(mu_scaled, Sigma_scaled, Kmm, KmmInv)
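# A numpy sketch (not from the original code) of the triangular
# reparameterisation used for Sigma above: an unconstrained square matrix is
# mapped to a lower-triangular factor whose diagonal is exponentiated, so
# L.dot(L.T) is always a valid covariance. Names here are illustrative.
import numpy as np

def tril_factor(Sigma_b):
    L = np.tril(Sigma_b)                           # keep the lower triangle
    np.fill_diagonal(L, np.exp(np.diag(Sigma_b)))  # force a positive diagonal
    return L

L = tril_factor(np.random.randn(3, 3))
cov = L.dot(L.T)
assert np.all(np.linalg.eigvalsh(cov) >= -1e-12)  # PSD up to round-off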
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, dataset=DataSet,
                    nkerns=[cls1, cls2], batch_size=100):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training / testing

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print type(train_set_x)
    #train_set_x.set_value(train_set_x.get_value(borrow=True)[:,:540])
    #valid_set_x.set_value(valid_set_x.get_value(borrow=True)[:,:540])
    #test_set_x.set_value(test_set_x.get_value(borrow=True)[:,:540])
    #train_set_x = train_set_x / 100
    #valid_set_x = valid_set_x / 100
    #test_set_x = test_set_x / 100

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    #n_test_batches /= batch_size
    # round up so that a partial final test batch is not dropped
    n_test_batches = (n_test_batches / batch_size) + \
        (n_test_batches % batch_size > 0)

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    Alr = T.scalar('Alr')  # adaptive learning rate
    x = T.matrix('x')      # the data is presented as rasterized images
    y = T.ivector('y')     # the labels are presented as 1D vector of
                           # [int] labels
    ishape = (27, 10)      # size of one input channel (inputs here are
                           # 2 x 27 x 10, not MNIST)

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 2*27*10)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer; the columns
    # beyond 540 hold extra per-sample features used later
    xinp = x[:, :540]
    layer0_input = xinp.reshape((batch_size, 2, 27, 10))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (27-fsx+1, 10-fsy+1)
    # maxpooling reduces this further by a factor of p1 per side
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 2, 27, 10),
                                filter_shape=(nkerns[0], 2, fsx, fsy),
                                poolsize=(p1, p1))
    cl2x = (27 - fsx + 1) / p1
    cl2y = (10 - fsy + 1) / p1

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (cl2x-fsx+1, cl2y-fsy+1)
    # maxpooling reduces this further by a factor of p2 per side
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], cl2x, cl2y),
                                filter_shape=(nkerns[1], nkerns[0], fsx, fsy),
                                poolsize=(p2, p2))
    hl1 = ((cl2x - fsx + 1) / p2) * ((cl2y - fsy + 1) / p2)

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * hl1)
    layer2_input = layer1.output.flatten(2)
    # append the 12 extra features kept in the last columns of x
    layer2_inputT = T.concatenate([layer2_input, x[:, 540:]], axis=1)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_inputT,
                         n_in=(nkerns[1] * hl1 * 1) + 12, n_out=nhu1,
                         activation=T.tanh)
    layer22 = HiddenLayer(rng, input=layer2.output, n_in=nhu1, n_out=nhu1,
                          activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer22.output, n_in=nhu1, n_out=n_out)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)
    #yPred = layer3.ypred(layer2.output)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], [layer3.errors(y), layer3.y_pred],
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })
    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer22.params + layer2.params + \
        layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = []
    for param_i, grad_i in zip(params, grads):
        #updates.append((param_i, param_i - learning_rate * grad_i))
        updates.append((param_i, param_i - Alr * grad_i))

    train_model = theano.function(
        [index, Alr], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size][:],
            y: train_set_y[index * batch_size:(index + 1) * batch_size][:]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    #best_params = None
    best_params = []
    best_validation_loss = numpy.inf
    prev_validation_loss = 200
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    Alrc = 0.1      # current adaptive learning rate
    AlrE = 0.00001  # smallest learning rate before we stop
    epochC = 0
    epoch = 0
    done_looping = False
    for param in params:
        best_params.append(param.get_value())

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        epochC = epochC + 1
        for minibatch_index in xrange(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index, Alrc)

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                lossratio = (this_validation_loss - prev_validation_loss) / \
                    (prev_validation_loss + 1)
                print(lossratio)
                print('epoch %i, minibatch %i/%i, validation error %f, lr %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100., Alrc))

                # if we got the best validation score until now
                #if this_validation_loss < best_validation_loss:
                if lossratio <= 0.0:
                    for i in range(len(params)):
                        best_params[i] = params[i].get_value()
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    prev_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    #tm = test_model(0)
                    yP = numpy.asarray([])
                    test_losses = [test_model(i)[0]
                                   for i in xrange(n_test_batches)]
                    for i in xrange(n_test_batches):
                        yP = numpy.concatenate((yP, test_model(i)[1]))
                    print yP.shape
                    test_score = numpy.mean(test_losses)

                    # per-class accuracies; note this reuses the name y for
                    # the ground-truth array (the symbolic y is no longer
                    # needed once the functions are compiled)
                    #yP = yPred#yPred(layer2.output.owner.inputs[0].get_value())
                    y = test_set_y.owner.inputs[0].get_value()
                    I1 = numpy.nonzero(y == 0.0)
                    I2 = numpy.nonzero(y == 1.0)
                    I3 = numpy.nonzero(y == 2.0)
                    I11 = numpy.nonzero(yP[I1[0]] == 0)
                    I12 = numpy.nonzero(yP[I1[0]] == 1)
                    I13 = numpy.nonzero(yP[I1[0]] == 2)
                    I21 = numpy.nonzero(yP[I2[0]] == 0)
                    I22 = numpy.nonzero(yP[I2[0]] == 1)
                    I23 = numpy.nonzero(yP[I2[0]] == 2)
                    I31 = numpy.nonzero(yP[I3[0]] == 0)
                    I32 = numpy.nonzero(yP[I3[0]] == 1)
                    I33 = numpy.nonzero(yP[I3[0]] == 2)
                    acc1 = float(float(I11[0].size) / float(I1[0].size))
                    acc2 = float(float(I22[0].size) / float(I2[0].size))
                    if n_out == 3:
                        acc3 = float(float(I33[0].size) / float(I3[0].size))
                    else:
                        acc3 = 0
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f, acc1 = %f, acc2 = %f, acc3 = %f, '
                           'I11 = %i, I12 = %i, I13 = %i, I21 = %i, I22 = %i, '
                           'I23 = %i, I31 = %i, I32 = %i, I33 = %i %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100., acc1 * 100., acc2 * 100.,
                           acc3 * 100, I11[0].size, I12[0].size, I13[0].size,
                           I21[0].size, I22[0].size, I23[0].size,
                           I31[0].size, I32[0].size, I33[0].size))
                    #print((' epoch %i, minibatch %i/%i, test error of best '
                    #       'model %f %%') %
                    #      (epoch, minibatch_index + 1, n_train_batches,
                    #       test_score * 100.))
                else:
                    # no improvement: anneal the learning rate and restore
                    # the best parameters seen so far
                    if Alrc <= AlrE:
                        done_looping = True
                        break
                    elif epochC > 40:
                        Alrc = Alrc / 2
                        for param, best_param in zip(params, best_params):
                            param.set_value(best_param)
                        epochC = 0
            #if patience <= iter:
            #    done_looping = True
            #    break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
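# A compact sketch (not from the original code) of the learning-rate policy
# implemented above: keep the best parameters, and once no validation
# improvement has been seen for `max_bad` epochs, halve the rate and retry
# from the best snapshot, stopping when the rate falls below a floor.
# All names here are illustrative.
def anneal(lr, bad_epochs, max_bad=40, floor=1e-5):
    if lr <= floor:
        return lr, True          # give up: signal done_looping
    if bad_epochs > max_bad:
        return lr / 2.0, False   # halve and restore the best snapshot
    return lr, False

print(anneal(0.1, 41))   # -> (0.05, False)
print(anneal(1e-6, 10))  # -> (1e-06, True)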
def evaluate_lenet5(learning_rate=0.1, n_epochs=200, nkerns=[20, 50],
                    batch_size=500):
    """ Demonstrates lenet on MNIST dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_train_batches //= batch_size
    n_valid_batches //= batch_size

    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = LeNetConvPoolLayer(rng=rng, input=layer0_input,
                                image_shape=(batch_size, 1, 28, 28),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = LeNetConvPoolLayer(rng=rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 12, 12),
                                filter_shape=(nkerns[1], nkerns[0], 5, 5),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)
    layer2 = HiddenLayer(rng=rng, input=layer2_input,
                         n_in=nkerns[1] * 4 * 4, n_out=300,
                         activation=T.tanh)
    layer3 = LogisticRegression(input=layer2.output, n_in=300, n_out=10)

    cost = layer3.negative_log_likelihood(y)

    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    params = layer3.params + layer2.params + layer1.params + layer0.params
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    patience = 10000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience // 2)
    best_validation_loss = numpy.inf
    best_iter = 0
    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print('training @ iter = ', iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i)
                                     for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    if this_validation_loss < best_validation_loss * \
                       improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    # snapshot the per-layer weights of the best model
                    fo = open('best_cnn_model.pkl', 'wb')
                    pickle.dump([[layer0.W, layer0.b], [layer1.W, layer1.b],
                                 [layer2.W, layer2.b], [layer3.W, layer3.b]],
                                fo)
                    fo.close()

            if patience <= iter:
                done_looping = True
                break

    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, ' %
          (best_validation_loss * 100., best_iter + 1))
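# A hedged sketch (not part of the original code) of reading back the
# snapshot written above. The pickle holds [[W, b], ...] pairs of theano
# shared variables per layer, so the raw arrays can be recovered with
# get_value(); this assumes theano is importable when unpickling.
import pickle

def load_best_cnn_model(path='best_cnn_model.pkl'):
    with open(path, 'rb') as fi:
        layers = pickle.load(fi)  # list of [W, b], layer0 .. layer3
    return [(W.get_value(), b.get_value()) for W, b in layers]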
def sgd_optimization_mnist(learning_rate=2e-2, loss_weight=1.8e+8,
                           curriculum_rate=0.1, n_curriculum_epochs=300,
                           epoch_iters=20, converge=1e-4, minibatch_size=50,
                           batch_size=4, k=4, func='concavefeature',
                           func_parameter=0.5, deep=True):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model. This is demonstrated on MNIST.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_curriculum_epochs: int
    :param n_curriculum_epochs: maximal number of curriculum epochs to run
                                the optimizer

    The MNIST dataset file comes from
    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz
    """
    print('loading data...')
    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    labels_, cluster_centers_, center_nn = datasets[3]
    num_cluster = cluster_centers_.shape[0]
    isize = int(numpy.sqrt(train_set_x.get_value(borrow=True).shape[1]))

    # compute number of minibatches for training, validation and testing
    n_train = train_set_x.get_value(borrow=True).shape[0]
    n_train_batches = n_train // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('building the model...')

    # allocate symbolic variables for the data
    index = T.lscalar()   # index to a [mini]batch
    cindex = T.lvector()  # indices of a [mini]batch

    # generate symbolic variables for input (x and y represent a minibatch)
    x = T.matrix('x')   # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    if deep is False:
        # construct the logistic regression class
        # Each MNIST image has size 28*28
        classifier = LogisticRegression(input=x, n_in=isize**2, n_out=10)

        # the cost we minimize during training is the negative log
        # likelihood of the model in symbolic format
        cost = classifier.negative_log_likelihood(y)
        cost_vec = classifier.negative_log_likelihood_vec(y)

        # compute the gradient of cost with respect to theta = (W,b)
        g_W = T.grad(cost=cost, wrt=classifier.W)
        g_b = T.grad(cost=cost, wrt=classifier.b)

        # start-snippet-3
        # specify how to update the parameters of the model as a list of
        # (variable, update expression) pairs.
        updates = [(classifier.W, classifier.W - learning_rate * g_W),
                   (classifier.b, classifier.b - learning_rate * g_b)]
    else:
        nfea = 500
        nkerns = [20, 50]
        n_channels = 1
        rng = numpy.random.RandomState(23455)
        layer0_input = x.reshape((-1, 1, isize, isize))

        # Construct the first convolutional pooling layer:
        # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
        # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
        # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
        layer0 = LeNetConvPoolLayer(
            rng,
            input=layer0_input,
            image_shape=(None, 1, isize, isize),
            filter_shape=(nkerns[0], 1, 5, 5),
            poolsize=(2, 2)
        )
        isize1 = int((isize - 5 + 1) / 2)

        # Construct the second convolutional pooling layer
        # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
        # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
        # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
        layer1 = LeNetConvPoolLayer(
            rng,
            input=layer0.output,
            image_shape=(None, nkerns[0], isize1, isize1),
            filter_shape=(nkerns[1], nkerns[0], 5, 5),
            poolsize=(2, 2)
        )

        # the HiddenLayer being fully-connected, it operates on 2D matrices
        # of shape (batch_size, num_pixels) (i.e. matrix of rasterized
        # images). This will generate a matrix of shape
        # (batch_size, nkerns[1] * 4 * 4), or (500, 50 * 4 * 4) = (500, 800)
        # with the default values.
        layer2_input = layer1.output.flatten(2)
        isize2 = int((isize1 - 5 + 1) / 2)

        # construct a fully-connected sigmoidal layer
        layer2 = HiddenLayer(
            rng,
            input=layer2_input,
            n_in=nkerns[1] * isize2 * isize2,
            n_out=nfea,
            activation=T.tanh
        )

        # classify the values of the fully-connected sigmoidal layer
        classifier = LogisticRegression(input=layer2.output, n_in=nfea,
                                        n_out=10)

        # the cost we minimize during training is the NLL of the model
        cost = classifier.negative_log_likelihood(y)
        cost_vec = classifier.negative_log_likelihood_vec(y)

        # create a list of all model parameters to be fit by gradient descent
        params = classifier.params + layer2.params + layer1.params + layer0.params

        # create a list of gradients for all model parameters
        grads = T.grad(cost, params)

        # train_model is a function that updates the model parameters by
        # SGD. Since this model has many parameters, it would be tedious to
        # manually create an update rule for each model parameter. We thus
        # create the updates list by automatically looping over all
        # (params[i], grads[i]) pairs.
        updates = [
            (param_i, param_i - learning_rate * grad_i)
            for param_i, grad_i in zip(params, grads)
        ]

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size: (index + 1) * batch_size],
            y: valid_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )

    # compiling a Theano function `train_model` that returns the cost, but
    # at the same time updates the parameters of the model based on the
    # rules defined in `updates`
    train_model = theano.function(
        inputs=[cindex],
        outputs=classifier.errors(y),
        updates=updates,
        givens={
            x: train_set_x[cindex],
            y: train_set_y[cindex]
        }
    )
    loss_model = theano.function(
        inputs=[cindex],
        outputs=cost_vec,
        givens={
            x: train_set_x[cindex],
            y: train_set_y[cindex]
        }
    )
    error_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size]
        }
    )
    # end-snippet-3

    ###############
    # TRAIN MODEL #
    ###############
    print('training the model...')
    # early-stopping parameters
    patience = 5000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    #validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch
    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = timeit.default_timer()

    # initialize
    minGain, sinGain, optSubmodular = initSubmodularFunc(cluster_centers_, k)
    real_iter = 0
    validation_frequency = 100
    old_epoch_all_loss = float('inf')
    loss_weight0 = loss_weight
    passed_index = numpy.array([])
    passed_index_epoch = numpy.array([])
    passes = 0
    output_seq = ()
    for curriculum_epoch in range(n_curriculum_epochs):
        print('Epoch', curriculum_epoch)
        old_all_loss = 0
        for iters in range(epoch_iters):
            if len(passed_index) <= n_train * 0.45:
                # compute loss
                loss_vec = loss_model(center_nn) * loss_weight / len(center_nn)
                all_loss = sum(loss_vec)
                #loss_vec_center = numpy.asarray([sum(loss_vec[labels_ == i]) for i in range(num_cluster)])
                loss_vec_center = loss_vec
                topkLoss = sum(numpy.partition(loss_vec_center, -k)[-k:])
                optObj = optSubmodular + topkLoss
                print(optSubmodular, topkLoss)

                # update A (topkIndex)
                left_index = pruneGroundSet(minGain, sinGain,
                                            loss_vec_center, k)
                topkIndex = modularLowerBound(cluster_centers_[left_index, :],
                                              k, func, func_parameter,
                                              loss_vec_center[left_index],
                                              optObj)
                topkIndex = left_index[topkIndex]

                # update classifier (train_model)
                train_index = numpy.array([])
                for i in range(len(topkIndex)):
                    train_index = numpy.append(
                        train_index, numpy.where(labels_ == topkIndex[i])[0])
                train_index = numpy.random.permutation(train_index.astype(int))
                print('number of training samples =', len(train_index))
                passes += len(train_index)
                passed_index = numpy.unique(
                    numpy.append(passed_index, train_index))
                passed_index_epoch = numpy.unique(
                    numpy.append(passed_index_epoch, train_index))
            else:
                train_index = numpy.random.permutation(
                    numpy.setxor1d(numpy.arange(n_train),
                                   passed_index_epoch).astype(int))
                #train_index = numpy.random.permutation(numpy.arange(n_train).astype(int))
                passes += len(train_index)
                passed_index_epoch = numpy.array([])
                #passed_index = numpy.arange(n_train)

            # training by mini-batch sgd
            start_index = 0
            train_loss = numpy.array([])
            while start_index < len(train_index):
                end_index = min([start_index + minibatch_size,
                                 len(train_index)])
                batch_index = train_index[start_index: end_index]
                start_index = end_index
                train_loss = numpy.append(train_loss,
                                          train_model(batch_index))
            this_train_loss = numpy.mean(train_loss)

            # stop the current epoch if converged
            diff_loss = old_all_loss - all_loss
            if diff_loss >= 0 and diff_loss <= all_loss * converge:
                break
            # show validation and test error periodically
            else:
                old_all_loss = all_loss
                if (iters + real_iter + 1) % validation_frequency == 0:
                    # compute zero-one loss on validation set
                    validation_losses = [validate_model(i)
                                         for i in range(n_valid_batches)]
                    this_validation_loss = numpy.mean(validation_losses)
                    test_losses = [test_model(i)
                                   for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    train_score = [error_model(i)
                                   for i in range(n_train_batches)]
                    this_train_score = numpy.mean(train_score)
                    print('minibatch %i, %i trainings, %i passes, '
                          'trainErr %f %%, validErr %f %%, testErr %f %%' %
                          (iters + real_iter + 1, len(passed_index), passes,
                           this_train_score * 100.,
                           this_validation_loss * 100., test_score * 100.))
                    output_seq = output_seq + (numpy.array(
                        [len(passed_index), passes, this_train_score * 100.,
                         this_validation_loss * 100., test_score * 100.]),)

                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        # improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss * \
                           improvement_threshold:
                            patience = max(patience, (iters + real_iter + 1) *
                                           patience_increase)
                        best_validation_loss = this_validation_loss
                        # save the best model
                        with open('best_model.pkl', 'wb') as f:
                            pickle.dump(classifier, f)
                    #print('Up to now %i training samples are used' % (len(passed_index)))

        # record total number of iterations
        real_iter += iters

        # adjust learning rate
        if all_loss > 1.001 * old_epoch_all_loss:
            print('no improvement: reduce learning rate!')
            learning_rate *= 0.96
        old_epoch_all_loss = all_loss

        # increase curriculum rate
        loss_weight *= curriculum_rate + 1

        if patience <= iters + real_iter + 1:
            break

    end_time = timeit.default_timer()
    print(('Optimization complete with best validation score of %f %%, '
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    #print('The code run for %d epochs, with %f epochs/sec' % (
    #    epoch, 1. * epoch / (end_time - start_time)))
    #print(('The code for file ' + os.path.split(__file__)[1] +
    #       ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)
    output_seq = numpy.vstack(output_seq)
    return output_seq
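# A minimal numpy illustration (not part of the original code) of the top-k
# selection used above: numpy.partition places the k largest losses in the
# last k slots without a full sort, which is what `topkLoss` sums. The
# values below are made up.
import numpy

loss_vec_center = numpy.array([0.3, 2.1, 0.7, 5.0, 1.2])
k = 2
topk = numpy.partition(loss_vec_center, -k)[-k:]  # unordered top-k
assert sorted(topk.tolist()) == [2.1, 5.0]
print(topk.sum())  # the `topkLoss` term added to optSubmodular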
def test_dA_joint(learning_rate=0.01, training_epochs=15000,
                  dataset='mnist.pkl.gz', batch_size=5,
                  output_folder='dA_plots'):
    """
    This demo is tested on MNIST

    :type learning_rate: float
    :param learning_rate: learning rate used for training the Denoising
                          AutoEncoder

    :type training_epochs: int
    :param training_epochs: number of epochs used for training

    :type dataset: string
    :param dataset: path to the pickled dataset
    """
    ##datasets = load_data(dataset)
    #from SdA_mapping import load_data_half
    #datasets = load_data_half(dataset)
    print 'loading data'
    datasets, x_mean, y_mean, x_std, y_std = load_vc()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]
    print 'loaded data'

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x1 = T.matrix('x1')  # the data is presented as rasterized images
    x2 = T.matrix('x2')  # the data is presented as rasterized images
    cor_reg = T.scalar('cor_reg')

    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    os.chdir(output_folder)

    ####################################
    # BUILDING THE MODEL NO CORRUPTION #
    ####################################
    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    #da = dA_joint(
    #    numpy_rng=rng,
    #    theano_rng=theano_rng,
    #    input1=x1,
    #    input2=x2,
    #    n_visible1=28 * 28 / 2,
    #    n_visible2=28 * 28 / 2,
    #    n_hidden=500
    #)
    print 'initialize functions'
    da = dA_joint(
        numpy_rng=rng,
        theano_rng=theano_rng,
        input1=x1,
        input2=x2,
        cor_reg=cor_reg,
        #n_visible1=28 * 28 / 2,
        #n_visible2=28 * 28 / 2,
        n_visible1=24,
        n_visible2=24,
        n_hidden=50)

    cost, updates = da.get_cost_updates(corruption_level=0.3,
                                        learning_rate=learning_rate)
    cor_reg_val = numpy.float32(5.0)
    train_da = theano.function(
        [index], cost, updates=updates,
        givens={
            x1: train_set_x[index * batch_size:(index + 1) * batch_size],
            x2: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    fprop_x1 = theano.function([], outputs=da.output1,
                               givens={x1: test_set_x}, name='fprop_x1')
    fprop_x2 = theano.function([], outputs=da.output2,
                               givens={x2: test_set_y}, name='fprop_x2')
    fprop_x1t = theano.function([], outputs=da.output1,
                                givens={x1: train_set_x}, name='fprop_x1')
    fprop_x2t = theano.function([], outputs=da.output2,
                                givens={x2: train_set_y}, name='fprop_x2')
    rec_x1 = theano.function([], outputs=da.rec1,
                             givens={x1: test_set_x}, name='rec_x1')
    rec_x2 = theano.function([], outputs=da.rec2,
                             givens={x2: test_set_y}, name='rec_x2')
    fprop_x1_to_x2 = theano.function([], outputs=da.reg,
                                     givens={x1: test_set_x},
                                     name='fprop_x12x2')
    updates_reg = [(da.cor_reg,
                    da.cor_reg + theano.shared(numpy.float32(0.1)))]
    update_reg = theano.function([], updates=updates_reg)
    print 'initialize functions ended'

    start_time = time.clock()

    ############
    # TRAINING #
    ############
    print 'training started'
    # undo the z-scoring of the test data for reporting
    X1 = test_set_x.eval()
    X1 *= x_std
    X1 += x_mean
    X2 = test_set_y.eval()
    X2 *= y_std
    X2 += y_mean
    from dcca_numpy import cor_cost

    # go through training epochs
    for epoch in xrange(training_epochs):
        # go through training set
        c = []
        for batch_index in xrange(n_train_batches):
            c.append(train_da(batch_index))
        #cor_reg_val += 1
        #da.cor_reg = theano.shared(cor_reg_val)
        update_reg()

        X1H = rec_x1()
        X2H = rec_x2()
        X1H *= x_std
        X1H += x_mean
        X2H *= y_std
        X2H += y_mean
        H1 = fprop_x1()
        H2 = fprop_x2()
        print 'Training epoch'
        print 'Reconstruction ', \
            numpy.mean(numpy.mean((X1H - X1)**2, 1)), \
            numpy.mean(numpy.mean((X2H - X2)**2, 1))

        if epoch % 5 == 2:  # pretrain middle layer
            print '... pre-training MIDDLE layer'
            H1t = fprop_x1t()
            H2t = fprop_x2t()
            h1 = T.matrix('x')  # hidden codes of view 1
            h2 = T.matrix('y')  # hidden codes of view 2
            from mlp import HiddenLayer
            numpy_rng = numpy.random.RandomState(89677)
            log_reg = HiddenLayer(numpy_rng, h1, 50, 50, activation=T.tanh)

            if 1:  # for middle layer
                learning_rate = 0.1
                #H1=theano.shared(H1)
                #H2=theano.shared(H2)
                # compute the gradients with respect to the model parameters
                logreg_cost = log_reg.mse(h2)
                gparams = T.grad(logreg_cost, log_reg.params)
                # compute list of fine-tuning updates
                updates = [(param, param - gparam * learning_rate)
                           for param, gparam in zip(log_reg.params, gparams)]
                train_fn_middle = theano.function(
                    inputs=[],
                    outputs=logreg_cost,
                    updates=updates,
                    givens={
                        h1: theano.shared(H1t),
                        h2: theano.shared(H2t)
                    },
                    name='train_middle')
            # a separate counter so the outer epoch variable is not clobbered
            mid_epoch = 0
            while mid_epoch < 100:
                print mid_epoch, train_fn_middle()
                mid_epoch += 1
            ##X2H=fprop_x1_to_x2()
            X2H = numpy.tanh(H1.dot(log_reg.W.eval()) + log_reg.b.eval())
            X2H = numpy.tanh(X2H.dot(da.W2_prime.eval()) + da.b2_prime.eval())
            X2H *= y_std
            X2H += y_mean
            print 'Regression ', numpy.mean(numpy.mean((X2H - X2)**2, 1))
        print 'Correlation ', cor_cost(H1, H2)

    end_time = time.clock()
    training_time = end_time - start_time
    print >> sys.stderr, ('The no corruption code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % (training_time / 60.))
    image = Image.fromarray(
        tile_raster_images(X=da.W1.get_value(borrow=True).T,
                           img_shape=(28, 14), tile_shape=(10, 10),
                           tile_spacing=(1, 1)))
    image.save('filters_corruption_0.png')

    from matplotlib import pyplot as pp
    pp.plot(H1[:10, :2], 'b')
    pp.plot(H2[:10, :2], 'r')
    pp.show()

    print cor_cost(H1, H2)
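# A small numpy sketch (not from the original code) of the evaluation above:
# reconstructions are mapped back from z-scored space with the stored
# mean/std, then scored by the mean per-sample squared error. Names mirror
# the variables above, but the data here is synthetic.
import numpy

def destandardize(Z, mean, std):
    return Z * std + mean

def recon_mse(XH, X):
    return numpy.mean(numpy.mean((XH - X) ** 2, 1))

X = numpy.random.randn(10, 24)
XH = X + 0.1 * numpy.random.randn(10, 24)
print(recon_mse(destandardize(XH, 0.0, 1.0), X))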
def __init__(self, numpy_rng, theano_rng=None, n_ins=247,
             hidden_layers_sizes=[200], n_outs=100,
             corruption_levels=[0.1, 0.1]):
    """ This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                      weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the sdA

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type corruption_levels: list of float
    :param corruption_levels: amount of corruption to use for each layer
    """

    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(123))
    # allocate symbolic variables for the data
    self.x = T.matrix('x')  # the data is presented as rasterized images
    #self.x = T.vector('x')
    # end-snippet-1

    # The SdA is an MLP, for which all weights of intermediate layers
    # are shared with a different denoising autoencoder.
    # We will first construct the SdA as a deep multilayer perceptron,
    # and when constructing each sigmoidal layer we also construct a
    # denoising autoencoder that shares weights with that layer.
    # During pretraining we will train these autoencoders (which will
    # lead to changing the weights of the MLP as well).
    # During fine-tuning we will finish training the SdA by doing
    # stochastic gradient descent on the MLP.

    # start-snippet-2
    for i in range(self.n_layers):
        # construct the sigmoidal layer

        # the size of the input is either the number of hidden units of
        # the layer below or the input size if we are on the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        # the input to this layer is either the activation of the hidden
        # layer below or the input of the SdA if you are on the first
        # layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)
        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)
        # it's arguably a philosophical question...
        # but we are going to only declare that the parameters of the
        # sigmoid_layers are parameters of the StackedDAA;
        # the visible biases in the dA are parameters of those
        # dA, but not the SdA
        self.params.extend(sigmoid_layer.params)

        # Construct a denoising autoencoder that shares weights with this
        # layer
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=hidden_layers_sizes[i],
                      W=sigmoid_layer.W,
                      bhid=sigmoid_layer.b)
        self.dA_layers.append(dA_layer)
    # end-snippet-2
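# A condensed sketch (not from the original code) of the layer-wiring rule
# in the loop above: layer i reads either the network input (i == 0) or the
# previous layer's output, and each denoising autoencoder reuses that
# layer's W and hidden bias. Pure-python stand-ins; the real layers are
# theano expressions.
def stack_sizes(n_ins, hidden_layers_sizes):
    sizes = []
    for i, n_out in enumerate(hidden_layers_sizes):
        n_in = n_ins if i == 0 else hidden_layers_sizes[i - 1]
        sizes.append((n_in, n_out))
    return sizes

assert stack_sizes(247, [200, 100]) == [(247, 200), (200, 100)]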
def __init__(self, config, verbose=True):
    '''
    @config: GRCNNConfiger. Configer used to set the architecture of
             GRCNNEncoder.
    '''
    self.encoder = GrCNNEncoder(config, verbose)
    # Link two parts
    self.input = self.encoder.input
    # Activation function
    self.act = Activation(config.activation)
    # Extract the hierarchical representation, the pyramids, from the
    # encoder. Combine the original time series and the compressed time
    # series
    self.pyramids = self.encoder.pyramids
    self.pyramids = T.concatenate([
        self.encoder.hidden0.dimshuffle('x', 0, 1), self.encoder.pyramids
    ])
    self.nsteps = self.pyramids.shape[0]
    # Use another scan function to compress each hierarchical representation
    # into the vector representation
    self.hierarchies, _ = theano.scan(
        fn=self._step_compress,
        sequences=[T.arange(self.nsteps, 0, -1), self.pyramids])
    # Global classifier, MLP, mixture of experts
    self.hidden_layer = HiddenLayer(self.hierarchies,
                                    (config.num_hidden, config.num_mlp),
                                    act=Activation(config.hiddenact))
    # Adding dropout support
    self.hidden = self.hidden_layer.output
    srng = T.shared_randomstreams.RandomStreams(config.random_seed)
    mask = srng.binomial(n=1, p=1 - config.dropout, size=self.hidden.shape)
    self.hidden *= T.cast(mask, floatX)
    # Connect the hidden layer after dropout to a logistic output layer
    self.output_layer = LogisticLayer(self.hidden, config.num_mlp)
    self.experts = self.output_layer.output
    # Global weighting mechanism, voting weights
    self.weight_layer = theano.shared(
        name='Weighting vector',
        value=np.random.rand(config.num_hidden).astype(floatX))
    self.weights = T.nnet.softmax(T.dot(self.hierarchies, self.weight_layer))
    # Compute the total number of parameters in the model
    self.num_params = self.encoder.num_params + self.hidden_layer.num_params + \
        self.output_layer.num_params + config.num_hidden
    # Final decision, bagging
    self.score = T.sum(T.flatten(self.experts) * T.flatten(self.weights))
    # Prediction for classification
    self.pred = self.score >= 0.5
    # Stack all the parameters
    self.params = []
    self.params += self.encoder.params
    self.params += self.hidden_layer.params
    self.params += self.output_layer.params
    self.params += [self.weight_layer]
    # Build objective function for binary classification problem; the score
    # is squeezed away from exactly 0 and 1 with machine eps so the logs
    # stay finite
    self.truth = T.iscalar(name='label')
    self.cost = -self.truth * T.log((self.score + np.finfo(float).eps) /
                                    (1 + 2 * np.finfo(float).eps)) - \
        (1 - self.truth) * T.log((1.0 - self.score + np.finfo(float).eps) /
                                 (1 + 2 * np.finfo(float).eps))
    ## Weight Decay
    if config.weight_decay:
        self.regularizer = self.encoder.L2_loss() + self.hidden_layer.L2_loss() + \
            self.output_layer.L2_loss() + T.sum(self.weight_layer ** 2)
        self.regularizer *= config.weight_decay_parameter
        self.cost += self.regularizer
    # Construct gradient vectors
    self.gradparams = T.grad(self.cost, self.params)
    # Construct gradient for the input matrix, fine-tuning
    self.input_grads = T.grad(self.cost, self.input)
    # Build and compile theano functions
    self.predict = theano.function(inputs=[self.input], outputs=self.pred)
    self.bagging = theano.function(inputs=[self.input], outputs=self.score)
    self.compute_gradient_and_cost = theano.function(
        inputs=[self.input, self.truth],
        outputs=self.gradparams + [self.cost, self.pred])
    self.compute_input_gradient = theano.function(
        inputs=[self.input, self.truth], outputs=self.input_grads)
    # Theano functions for debugging purposes
    self.show_weights = theano.function(inputs=[self.input],
                                        outputs=self.weights)
    self.show_scores = theano.function(inputs=[self.input],
                                       outputs=self.experts)
    self.show_hierarchy = theano.function(inputs=[self.input],
                                          outputs=self.hierarchies)
    self.show_prob = theano.function(inputs=[self.input], outputs=self.score)
    self.show_cost = theano.function(inputs=[self.input, self.truth],
                                     outputs=self.cost)
    if verbose:
        logger.debug('GrCNNBagger successfully built...')
        logger.debug('Hierarchical structure of GrCNN for classification...')
        logger.debug('Total number of parameters in the model: %d' %
                     self.num_params)
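# A numpy sketch (not from the original code) of the stabilised binary
# cross-entropy built above: the score is squeezed away from exactly 0 and 1
# with machine epsilon before the logs, so the cost stays finite even for a
# saturated score. `score` and `truth` are illustrative names.
import numpy as np

def stable_bce(score, truth):
    eps = np.finfo(float).eps
    p = (score + eps) / (1.0 + 2.0 * eps)
    q = (1.0 - score + eps) / (1.0 + 2.0 * eps)
    return -truth * np.log(p) - (1 - truth) * np.log(q)

print(stable_bce(1.0, 0))  # large but finite, unlike -log(0)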
def __init__(self, numpy_rng=None, useRelu=None, W_distribution=None,
             LayerNodes=None, dropout=None):
    self.n_layers = len(LayerNodes) - 2
    self.dA_layers = []
    self.dropout_layers = []
    self.layers = []

    self.x = T.matrix('x')
    self.y = T.ivector('y')

    next_layer_input = self.x
    next_dropout_layer_input = _dropout_from_layer(numpy_rng, self.x,
                                                   p=dropout[0])
    weight_matrix_sizes = zip(LayerNodes, LayerNodes[1:])
    layer_counter = 0
    for n_in, n_out in weight_matrix_sizes[:-1]:
        if useRelu == True:
            activation = relu
            activation1 = relu
            if layer_counter == 0:
                activation2 = T.nnet.sigmoid
            else:
                activation2 = T.nnet.softplus
        else:
            activation = T.nnet.sigmoid
            activation1 = T.nnet.sigmoid
            activation2 = T.nnet.sigmoid
        W_bound = 4. * numpy.sqrt(6. / (n_in + n_out))

        next_dropout_layer = DropoutHiddenLayer(
            numpy_rng=numpy_rng,
            input=next_dropout_layer_input,
            activation=activation,
            n_in=n_in,
            n_out=n_out,
            W_distribution=W_distribution,
            W_bound=W_bound,
            dropout_rate=dropout[layer_counter + 1])
        self.dropout_layers.append(next_dropout_layer)
        next_dropout_layer_input = next_dropout_layer.output

        next_layer = HiddenLayer(
            numpy_rng=numpy_rng,
            input=next_layer_input,
            activation=activation,
            n_in=n_in,
            n_out=n_out,
            W=next_dropout_layer.W * (1 - dropout[layer_counter]),
            b=next_dropout_layer.b)
        self.layers.append(next_layer)

        dA_layer = dA(numpy_rng=numpy_rng,
                      input=next_layer_input,
                      useRelu=useRelu,
                      activation1=activation1,
                      activation2=activation2,
                      n_visible=n_in,
                      n_hidden=n_out,
                      W=next_dropout_layer.W,
                      b=next_dropout_layer.b)
        self.dA_layers.append(dA_layer)
        next_layer_input = next_layer.output

        if layer_counter == 0:
            self.L1 = abs(next_dropout_layer.W).sum()
            self.L2 = (next_dropout_layer.W**2).sum()
        else:
            self.L1 = self.L1 + abs(next_dropout_layer.W).sum()
            self.L2 = self.L2 + (next_dropout_layer.W**2).sum()
        layer_counter += 1

    n_in, n_out = weight_matrix_sizes[-1]
    dropout_output_layer = LogisticRegression(
        input=next_dropout_layer_input, n_in=n_in, n_out=n_out)
    self.dropout_layers.append(dropout_output_layer)
    self.L1 = self.L1 + abs(dropout_output_layer.W).sum()
    self.L2 = self.L2 + (dropout_output_layer.W**2).sum()
    self.dropout_negative_log_likelihood = \
        self.dropout_layers[-1].negative_log_likelihood(self.y)

    output_layer = LogisticRegression(
        input=next_layer_input,
        n_in=n_in,
        n_out=n_out,
        W=dropout_output_layer.W * (1 - dropout[-1]),
        b=dropout_output_layer.b)
    self.layers.append(output_layer)
    self.error = self.layers[-1].error(self.y)
    self.sensitivity = self.layers[-1].sensitivity(self.y)
    self.specificity = self.layers[-1].specificity(self.y)
    self.class1_pred = self.layers[-1].class1_pred(self.y)

    self.params = [param for layer in self.dropout_layers
                   for param in layer.params]
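# A numpy sketch (not from the original code) of the train/test weight
# pairing used above: the dropout layers train with a Bernoulli mask, while
# the tied inference layers use the same weights scaled by the keep
# probability (W * (1 - p)), so expected pre-activations match. Names are
# illustrative.
import numpy as np

rng = np.random.RandomState(0)
W = rng.randn(4, 3)
x = rng.randn(1, 4)
p = 0.5  # dropout rate

mask = rng.binomial(n=1, p=1 - p, size=x.shape)  # training-time mask
train_pre = (x * mask).dot(W)
test_pre = x.dot(W * (1 - p))                    # inference-time rescale
print(train_pre, test_pre)  # equal only in expectation over masks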
def __init__(self, config=None, verbose=True): # Construct two GrCNNEncoders for matching two sentences self.encoderL = GrCNNEncoder(config, verbose) self.encoderR = GrCNNEncoder(config, verbose) # Link two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Get output of two GrCNNEncoders self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Activation function self.act = Activation(config.activation) # MLP Component self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1) self.hidden_layer = HiddenLayer( self.hidden, (2 * config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hidden.shape) self.compressed_hidden *= T.cast(mask, floatX) # Use concatenated vector as input to the logistic regression classifier self.logistic_layer = LogisticLayer(self.compressed_hidden, config.num_mlp) self.output = self.logistic_layer.output self.pred = self.logistic_layer.pred # Accumulate parameters self.params += self.logistic_layer.params # Compute the total number of parameters in this model self.num_params_encoder = config.num_input * config.num_hidden + \ config.num_hidden * config.num_hidden * 2 + \ config.num_hidden + \ config.num_hidden * 3 * 2 + \ 3 self.num_params_encoder *= 2 self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \ config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build target function self.truth = T.ivector(name='label') self.learn_rate = T.scalar(name='learning rate') self.cost = self.logistic_layer.NLL_loss(self.truth) # Build computational graph and compute the gradient of the target function # with respect to model parameters self.gradparams = T.grad(self.cost, self.params) # Updates formula for stochastic descent algorithm self.updates = [] for param, gradparam in zip(self.params, self.gradparams): self.updates.append((param, param - self.learn_rate * gradparam)) # Compile theano function self.objective = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.cost) self.predict = theano.function(inputs=[self.inputL, self.inputR], outputs=self.pred) # Compute the gradient of the objective function with respect to the model parameters self.compute_cost_and_gradient = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.gradparams + [self.cost, self.pred]) # Output function for debugging purpose self.show_hidden = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.hidden) self.show_compressed_hidden = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.compressed_hidden) self.show_output = theano.function( inputs=[self.inputL, self.inputR, self.truth], outputs=self.output) if verbose: logger.debug( 'Architecture of GrCNNMatcher built finished, summarized below: ' ) logger.debug('Input dimension: %d' % config.num_input) logger.debug('Hidden dimension inside GrCNNMatcher pyramid: %d' % config.num_hidden) logger.debug('Hidden dimension of MLP: %d' % config.num_mlp) logger.debug('Number of parameters in encoder part: %d' % self.num_params_encoder) logger.debug('Number of parameters in 
classifier part: %d' % self.num_params_classifier) logger.debug('Number of total parameters in this model: %d' % self.num_params)
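# --- Added note: the update rule built above is plain SGD, (param, param - lr * grad)
# for every parameter. A self-contained Theano sketch of the same pattern on a
# hypothetical one-weight model (names are illustrative, not from GrCNNMatcher):
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros(3, dtype=theano.config.floatX), name='w')
x_ = T.vector('x')
t_ = T.scalar('t')
lr_ = T.scalar('lr')
loss_ = (T.dot(x_, w) - t_) ** 2
updates_ = [(p, p - lr_ * g) for p, g in zip([w], T.grad(loss_, [w]))]
sgd_step = theano.function([x_, t_, lr_], loss_, updates=updates_)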
def __init__(self, config=None, verbose=True): # Construct two GrCNNEncoders for matching two sentences self.encoderL = ExtGrCNNEncoder(config, verbose) self.encoderR = ExtGrCNNEncoder(config, verbose) # Link the parameters of two parts self.params = [] self.params += self.encoderL.params self.params += self.encoderR.params # Build three kinds of inputs: # 1, inputL, inputR. This pair is used for computing the score after training # 2, inputPL, inputPR. This part is used for training positive pairs # 3, inputNL, inputNR. This part is used for training negative pairs self.inputL = self.encoderL.input self.inputR = self.encoderR.input # Positive self.inputPL = T.matrix(name='inputPL', dtype=floatX) self.inputPR = T.matrix(name='inputPR', dtype=floatX) # Negative self.inputNL = T.matrix(name='inputNL', dtype=floatX) self.inputNR = T.matrix(name='inputNR', dtype=floatX) # Linking input-output mapping self.hiddenL = self.encoderL.output self.hiddenR = self.encoderR.output # Positive self.hiddenPL = self.encoderL.encode(self.inputPL) self.hiddenPR = self.encoderR.encode(self.inputPR) # Negative self.hiddenNL = self.encoderL.encode(self.inputNL) self.hiddenNR = self.encoderR.encode(self.inputNR) # Activation function self.act = Activation(config.activation) # MLP Component self.hidden = T.concatenate([self.hiddenL, self.hiddenR], axis=1) self.hiddenP = T.concatenate([self.hiddenPL, self.hiddenPR], axis=1) self.hiddenN = T.concatenate([self.hiddenNL, self.hiddenNR], axis=1) # Build hidden layer self.hidden_layer = HiddenLayer( self.hidden, (2 * config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output self.compressed_hiddenP = self.hidden_layer.encode(self.hiddenP) self.compressed_hiddenN = self.hidden_layer.encode(self.hiddenN) # Accumulate parameters self.params += self.hidden_layer.params # Dropout parameter srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hidden.shape) maskP = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hiddenP.shape) maskN = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hiddenN.shape) self.compressed_hidden *= T.cast(mask, floatX) self.compressed_hiddenP *= T.cast(maskP, floatX) self.compressed_hiddenN *= T.cast(maskN, floatX) # Score layers self.score_layer = ScoreLayer(self.compressed_hidden, config.num_mlp) self.output = self.score_layer.output self.scoreP = self.score_layer.encode(self.compressed_hiddenP) self.scoreN = self.score_layer.encode(self.compressed_hiddenN) # Accumulate parameters self.params += self.score_layer.params # Build cost function self.cost = T.mean( T.maximum(T.zeros_like(self.scoreP), 1.0 - self.scoreP + self.scoreN)) # Construct the gradient of the cost function with respect to the model parameters self.gradparams = T.grad(self.cost, self.params) # Compute the total number of parameters in the model self.num_params_encoder = self.encoderL.num_params + self.encoderR.num_params self.num_params_classifier = 2 * config.num_hidden * config.num_mlp + \ config.num_mlp + \ config.num_mlp + 1 self.num_params = self.num_params_encoder + self.num_params_classifier # Build class methods self.score = theano.function(inputs=[self.inputL, self.inputR], outputs=self.output) self.compute_cost_and_gradient = theano.function( inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR], outputs=self.gradparams + [self.cost, self.scoreP, self.scoreN]) self.show_scores = theano.function( 
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.scoreP, self.scoreN])
        self.show_hiddens = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.hiddenP, self.hiddenN])
        self.show_inputs = theano.function(
            inputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR],
            outputs=[self.inputPL, self.inputPR, self.inputNL, self.inputNR])
        if verbose:
            logger.debug('Architecture of ExtGrCNNMatchScorer finished building, summarized below:')
            logger.debug('Input dimension: %d' % config.num_input)
            logger.debug('Hidden dimension inside GrCNNMatchScorer pyramid: %d' % config.num_hidden)
            logger.debug('Hidden dimension of MLP: %d' % config.num_mlp)
            logger.debug('Number of gating functions: %d' % config.num_gates)
            logger.debug('There are 2 ExtGrCNNEncoders used in the model.')
            logger.debug('Total number of parameters used in the model: %d' % self.num_params)
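# --- Added note: the training cost above is a margin ranking (hinge) loss,
# mean(max(0, 1 - scoreP + scoreN)): it is zero once every positive pair
# outscores its paired negative by at least the margin. A small numpy sketch:
import numpy as np

def margin_ranking_loss(score_pos, score_neg, margin=1.0):
    # zero loss where score_pos >= score_neg + margin, linear otherwise
    return np.mean(np.maximum(0.0, margin - score_pos + score_neg))

# the first pair is already separated by more than the margin, so only
# the second pair contributes:
margin_ranking_loss(np.array([2.5, 0.2]), np.array([0.3, 0.4]))  # -> 0.6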
def __init__(self, config=None, verbose=True): self.encoder = GrCNNEncoder(config, verbose) # Link two parts self.params = self.encoder.params self.input = self.encoder.input self.hidden = self.encoder.output # Activation function self.act = Activation(config.activation) # MLP Component self.hidden_layer = HiddenLayer(self.hidden, (config.num_hidden, config.num_mlp), act=Activation(config.hiddenact)) self.compressed_hidden = self.hidden_layer.output # Dropout regularization srng = T.shared_randomstreams.RandomStreams(config.random_seed) mask = srng.binomial(n=1, p=1 - config.dropout, size=self.compressed_hidden.shape) self.compressed_hidden *= T.cast(mask, floatX) # Accumulate model parameters self.params += self.hidden_layer.params # Softmax Component self.softmax_layer = SoftmaxLayer(self.compressed_hidden, (config.num_mlp, config.num_class)) self.raw_output = self.softmax_layer.output self.pred = self.softmax_layer.pred self.params += self.softmax_layer.params # Compute the total number of parameters in this model self.num_params_encoder = config.num_input * config.num_hidden + \ config.num_hidden * config.num_hidden * 2 + \ config.num_hidden + \ config.num_hidden * 3 * 2 + \ 3 self.num_params_classifier = config.num_hidden * config.num_mlp + \ config.num_mlp + \ config.num_mlp * config.num_class + \ config.num_class self.num_params = self.num_params_encoder + self.num_params_classifier # Build target function self.truth = T.ivector(name='label') self.learn_rate = T.scalar(name='learning rate') self.cost = self.softmax_layer.NLL_loss(self.truth) # Build computational graph and compute the gradient of the target # function with respect to model parameters self.gradparams = T.grad(self.cost, self.params) # Updates formula for stochastic gradient descent algorithm self.updates = [] for param, gradparam in zip(self.params, self.gradparams): self.updates.append((param, param - self.learn_rate * gradparam)) # Compile theano function self.objective = theano.function(inputs=[self.input, self.truth], outputs=self.cost) self.predict = theano.function(inputs=[self.input], outputs=self.pred) # Compute the gradient of the objective function with respect to the model parameters self.compute_cost_and_gradient = theano.function( inputs=[self.input, self.truth], outputs=self.gradparams + [self.cost]) # Output function for debugging purpose self.show_hidden = theano.function(inputs=[self.input, self.truth], outputs=self.hidden) self.show_compressed_hidden = theano.function( inputs=[self.input, self.truth], outputs=self.compressed_hidden) self.show_output = theano.function(inputs=[self.input, self.truth], outputs=self.raw_output) if verbose: logger.debug( 'Architecture of GrCNN built finished, summarized as below: ') logger.debug('Input dimension: %d' % config.num_input) logger.debug('Hidden dimension inside GrCNNEncoder pyramid: %d' % config.num_hidden) logger.debug('Hidden dimension of MLP: %d' % config.num_mlp) logger.debug('Number of target classes: %d' % config.num_class) logger.debug('Number of parameters in encoder part: %d' % self.num_params_encoder) logger.debug('Number of parameters in classifier part: %d' % self.num_params_classifier) logger.debug('Number of total parameters in this model: %d' % self.num_params)
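# --- Added note: the classifier parameter count above is just the weights plus
# biases of the hidden layer and the softmax layer. A quick sanity check with
# illustrative sizes (not the original config values):
num_hidden, num_mlp, num_class = 128, 64, 5
n_hidden_params = num_hidden * num_mlp + num_mlp      # W and b of the MLP layer
n_softmax_params = num_mlp * num_class + num_class    # W and b of the softmax
print(n_hidden_params + n_softmax_params)             # total classifier params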
def test_CNN(learning_rate=0.01, L1_reg=0.00, L2_reg=0.0001, n_epochs=1000,
             batch_size=20, n_hidden=500, dataset='txtData7.pkl'):
    dataset = load_data(dataset)
    ''' train_set_x.get_value(); tt.shape ---(50000, 784)'''
    train_set_x, train_set_y = dataset[0]
    valid_set_x, valid_set_y = dataset[1]
    test_set_x, test_set_y = dataset[2]
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size
    print('training set has %i batches' % n_train_batches)
    print('validate set has %i batches' % n_valid_batches)
    print('testing set has %i batches' % n_test_batches)
    # symbolic variables
    x = T.matrix()
    y = T.ivector()  # lvector: [long int] labels; ivector: [int] labels
    minibatch_index = T.lscalar()
    print 'build the model...'
    rng = numpy.random.RandomState(23455)
    # transform x from (batchsize, 28*28) to (batchsize, feature, 28, 28)
    # I_shape = (28, 28), F_shape = (5, 5)
    # After the first convolution/pooling stage: the first layer has 20
    # convolution kernels, so every input image produces 20 feature maps.
    N_filters_0 = 20
    D_features_0 = 1
    # The input must be 4D, hence the reshape; this layer's input is a batch
    # of 20 samples, 28*28 each.
    layer0_input = x.reshape((batch_size, D_features_0, 40, 36))
    layer0 = LeNetConvPoolLayer(rng,
                                input=layer0_input,
                                filter_shape=(N_filters_0, D_features_0, 5, 5),
                                image_shape=(batch_size, 1, 40, 36))
    # layer0.output: (batch_size, N_filters_0, (28-5+1)/2, (28-5+1)/2) -> 20*20*12*12
    # Convolution gives 24*24 maps; pooling reduces them to 12*12, so the
    # final output is 20 samples, each with 20 feature maps of 12*12.
    # Convolution multiplies each window by the kernel weights and sums to
    # produce one pixel of a feature map; max pooling is used here, which
    # cuts down the number of parameters to train.
    N_filters_1 = 50
    D_features_1 = N_filters_0
    layer1 = LeNetConvPoolLayer(rng,
                                input=layer0.output,
                                filter_shape=(N_filters_1, D_features_1, 5, 5),
                                image_shape=(batch_size, N_filters_0, 18, 16))
    # layer1.output: (20, 50, 4, 4)
    # The second layer outputs 20 samples, each with 50 feature maps of 4*4;
    # its convolution and pooling work exactly as in layer0.
    # Next, flatten each sample's feature maps into a 1D vector, giving a
    # 20*800 matrix with one row per sample: (20, 50, 4, 4) -> (20, 50*4*4)
    layer2_input = layer1.output.flatten(2)
    # The previous output is a 20*800 matrix; the fully-connected hidden
    # layer maps the 800 inputs down to 500 units.
    layer2 = HiddenLayer(rng, layer2_input, n_in=50 * 7 * 6, n_out=500,
                         activation=T.tanh)
    # Logistic regression layer: a softmax over the outputs.
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=3)
    # regularization terms
    ##########################
    cost = (layer3.negative_log_likelihood(y) +
            L1_reg * (layer2.L1_1 + layer3.L1_2) +
            L2_reg * (layer2.L2_1 + layer3.L2_2))
    test_model = theano.function(
        inputs=[minibatch_index],
        outputs=layer3.errors(y),
        givens={
            x: test_set_x[minibatch_index * batch_size:(minibatch_index + 1) * batch_size],
            y: test_set_y[minibatch_index * batch_size:(minibatch_index + 1) * batch_size]
        })
    valid_model = theano.function(
        inputs=[minibatch_index],
        outputs=layer3.errors(y),
        givens={
            x: valid_set_x[minibatch_index * batch_size:(minibatch_index + 1) * batch_size],
            y: valid_set_y[minibatch_index * batch_size:(minibatch_index + 1) * batch_size]
        })
    params = layer3.params + layer2.params + layer1.params + layer0.params
    gparams = T.grad(cost, params)
    updates = []
    for par, gpar in zip(params, gparams):
        updates.append((par, par - learning_rate * gpar))
    train_model = theano.function(
        inputs=[minibatch_index],
        outputs=[cost],
        updates=updates,
        givens={
            x: train_set_x[minibatch_index * batch_size:(minibatch_index + 1) * batch_size],
            y: train_set_y[minibatch_index * batch_size:(minibatch_index + 1) * batch_size]
        })
    #---------------------Train-----------------------#
    print 'training...'
print('training set has %i batches' % n_train_batches) print('validate set has %i batches' % n_valid_batches) print('testing set has %i batches' % n_test_batches) epoch = 0 patience = 10000 patience_increase = 2 validation_frequency = min(n_train_batches, patience / 2) improvement_threshold = 0.995 best_parameters = None min_validation_error = numpy.inf done_looping = False start_time = time.clock() while (epoch < n_epochs) and (not done_looping): epoch += 1 for minibatch_index in xrange(n_train_batches): #cur_batch_train_error,cur_params = train_model(minibatch_index) cur_batch_train_error = train_model(minibatch_index) iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: #validation_error = numpy.mean([valid_model(idx) for idx in xrange(n_valid_batches)]) validation_losses = [ valid_model(i) for i in xrange(n_valid_batches) ] validation_error = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, validation_error * 100.)) if validation_error < min_validation_error: if validation_error < min_validation_error * improvement_threshold: patience = max(patience, iter * patience_increase) min_validation_error = validation_error #best_parameters = cur_params best_iter = iter save_params(layer0.params, layer1.params, layer2.params, layer3.params) test_error = numpy.mean( [test_model(idx) for idx in xrange(n_test_batches)]) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_error * 100.)) if iter >= patience: done_looping = True break end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (100 - min_validation_error * 100., best_iter + 1, 100 - test_error * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
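# --- Added note: the loop above implements patience-based early stopping:
# whenever validation error improves by more than improvement_threshold,
# patience is extended to patience_increase times the current iteration.
# A minimal standalone sketch of the same bookkeeping (names illustrative):
def make_early_stopper(patience=10000, patience_increase=2,
                       improvement_threshold=0.995):
    state = {'patience': patience, 'best': float('inf')}
    def should_stop(iter_num, val_error):
        if val_error < state['best']:
            if val_error < state['best'] * improvement_threshold:
                state['patience'] = max(state['patience'],
                                        iter_num * patience_increase)
            state['best'] = val_error
        return iter_num >= state['patience']
    return should_stop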
def __init__(self, rng, n_in=784, n_hidden=[500, 500], n_out=10,
             lambda_reg=0.001, alpha_reg=0.001):
    """This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_in: int
    :param n_in: dimension of the input to the DBN

    :type n_hidden: list of ints
    :param n_hidden: intermediate layers size, must contain at least one value

    :type n_out: int
    :param n_out: dimension of the output of the network

    :type lambda_reg: float
    :param lambda_reg: parameter to control the sparsity of weights by the l_1
                       norm. The regularization term is
                       lambda_reg * ((1-alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1).
                       Thus, the larger lambda_reg is, the sparser the weights are.

    :type alpha_reg: float
    :param alpha_reg: parameter from the interval [0,1] to control the smoothness
                      of weights by the squared l_2 norm. The regularization term is
                      lambda_reg * ((1-alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1).
                      Thus, the smaller alpha_reg is, the smoother the weights are.
    """
    self.hidden_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(n_hidden)
    assert self.n_layers > 0
    # allocate symbolic variables for the data
    self.x = T.matrix('x')  # the data, each row is a sample
    self.y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels
    for i in xrange(self.n_layers):
        # construct the sigmoidal layer
        # the size of the input is either the number of hidden units of
        # the layer below or the input size if we are on the first layer
        if i == 0:
            input_size = n_in
        else:
            input_size = n_hidden[i - 1]
        # the input to this layer is either the activation of the hidden
        # layer below or the input of the SdA if you are on the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.hidden_layers[-1].output
        sigmoid_layer = HiddenLayer(rng=rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=n_hidden[i],
                                    activation=T.nnet.sigmoid)
        # add the layer to our list of layers
        self.hidden_layers.append(sigmoid_layer)
        # it's arguably a philosophical question...
        # but we are going to only declare that the parameters of the
        # sigmoid_layers are parameters of the StackedDAA;
        # the visible biases in the dA are parameters of those
        # dA, but not the SdA
        self.params.extend(sigmoid_layer.params)
        # Construct an RBM that shares weights with this layer
        rbm_layer = RBM(numpy_rng=rng,
                        theano_rng=None,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=n_hidden[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)
    # We now need to add a logistic layer on top of the MLP
    if self.n_layers > 0:
        self.logRegressionLayer = LogisticRegression(
            input=self.hidden_layers[-1].output,
            n_in=n_hidden[-1],
            n_out=n_out)
    else:
        self.logRegressionLayer = LogisticRegression(input=self.x,
                                                     n_in=input_size,
                                                     n_out=n_out)
    self.params.extend(self.logRegressionLayer.params)
    # regularization
    L1s = []
    L2_sqrs = []
    for i in range(self.n_layers):
        L1s.append(abs(self.hidden_layers[i].W).sum())
        L2_sqrs.append((self.hidden_layers[i].W**2).sum())
    L1s.append(abs(self.logRegressionLayer.W).sum())
    L2_sqrs.append((self.logRegressionLayer.W**2).sum())
    self.L1 = T.sum(L1s)
    self.L2_sqr = T.sum(L2_sqrs)
    # compute the cost for the second phase of training,
    # defined as the negative log likelihood
    self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood(self.y)
    self.cost = self.negative_log_likelihood + \
        lambda_reg * ((1.0 - alpha_reg) * 0.5 * self.L2_sqr + alpha_reg * self.L1)
    # compute the gradients with respect to the model parameters
    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.logRegressionLayer.errors(self.y)
    self.y_pred = self.logRegressionLayer.y_pred
    self.y_pred_prob = self.logRegressionLayer.y_pred_prob
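# --- Added note: the cost above adds an elastic-net style penalty,
# lambda_reg * ((1 - alpha_reg)/2 * ||W||_2^2 + alpha_reg * ||W||_1),
# summed over all weight matrices. A numpy sketch of the same term:
import numpy as np

def elastic_net_penalty(weight_mats, lambda_reg=0.001, alpha_reg=0.001):
    l1 = sum(np.abs(W).sum() for W in weight_mats)
    l2_sqr = sum((W ** 2).sum() for W in weight_mats)
    return lambda_reg * ((1.0 - alpha_reg) * 0.5 * l2_sqr + alpha_reg * l1)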
def evaluate_convnet(learning_rate=0.02, n_epochs=2000, dataset='single_sphere', nkerns=[32, 64, 64, 128], batch_size=128, filter_shapes=[[5, 5], [5, 5], [3, 3], [3, 3]], momentum=0.9, half_time=500): """ Demonstrates lenet on MNIST dataset :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer """ rng = numpy.random.RandomState(23455) datasets, max_row, max_col = load_char_data( ) #load_latline_dataset() # << TODO implement train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_train_batches //= batch_size n_valid_batches //= batch_size n_test_batches //= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch # start-snippet-1 x = T.tensor4( 'x') # the data is presented as spiking of sensors at lateral line y = T.ivector( 'y') # The output is the distance (in x- and y-directions) of sphere idxs = T.matrix('idxs') ###################### # BUILD ACTUAL MODEL # ###################### print('... building the model') # Reshape matrix of sensor detections to a 4D tensor layer0_input = x # x.reshape((batch_size, depth_dim, conv_dims[0], conv_dims[1])) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = conv_layer(rng, input=layer0_input, image_shape=(batch_size, 3, max_row, max_col), filter_shape=(nkerns[0], 3, filter_shapes[0][0], filter_shapes[0][1]), pooling=False, activation=T.nnet.relu) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = conv_layer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], max_row, max_col), filter_shape=(nkerns[1], nkerns[0], filter_shapes[1][0], filter_shapes[1][1]), pooling=True, poolsize=(2, 2), activation=T.nnet.relu, keepDims=True) layer1b = conv_layer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], max_row, max_col), filter_shape=(nkerns[2], nkerns[1], filter_shapes[2][0], filter_shapes[2][1]), pooling=False, activation=T.nnet.relu, keepDims=True) layer1c = conv_layer(rng, input=layer1b.output, image_shape=(batch_size, nkerns[2], max_row, max_col), filter_shape=(nkerns[3], nkerns[2], filter_shapes[3][0], filter_shapes[3][1]), pooling=False, activation=T.nnet.relu, keepDims=True) spp_layer = SPP(layer1c.output, idxs) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
layer2_input = spp_layer.output # construct a fully-connected ReLU layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=spp_layer.M * nkerns[-1], n_out=500, activation=T.nnet.relu) layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=39) # linear regression by using a fully connected layer '''layer3 = HiddenLayer( rng, input=layer2.output, n_in=conv_dims[1] * 2, n_out=2, activation=None ) ''' # classify the values of the fully-connected sigmoidal layer #layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) demo_model = theano.function( [index], [layer3.y_pred, y], givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params + layer1b.params + layer1c.params # create a list of gradients for all model parameters #grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. #updates = [ # (param_i, param_i - learning_rate * grad_i) # for param_i, grad_i in zip(params, grads) #] l_r = T.scalar('l_r', dtype=theano.config.floatX) updates = gradient_updates_momentum(cost, params, l_r, momentum) train_model = theano.function( [index, l_r], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) # end-snippet-1 ############### # TRAIN MODEL # ############### print('... training') # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience // 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 if epoch % half_time == 0: learning_rate /= 2 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) cost_ij = train_model(minibatch_index, learning_rate) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # test it on the test set test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) print((' epoch %i, minibatch %i/%i, test error of ' 'best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) if patience <= iter: done_looping = True break end_time = timeit.default_timer() print('Optimization complete.') print('Best validation MSE of %f %% obtained at iteration %i, ' 'with test MSE %f ' % (best_validation_loss, best_iter + 1, test_score)) print( ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr) demo_outputs = [demo_model(i) for i in range(n_test_batches)] sensor_range = [-1.5, 1.5] y_range = [0, 1] plt.ion() plotting = False MED = 0 for i in range(n_test_batches): predicted, target = demo_outputs[i] for j in range(predicted.shape[0]): x_hat, y_hat = predicted[j] x, y = target[j] MED += numpy.sqrt((x - x_hat)**2 + (y - y_hat)**2) if plotting: plt.clf() plt.plot([x_hat], [y_hat], 'ro') plt.plot([x], [y], 'g+') plt.grid() plt.axis( [sensor_range[0], sensor_range[1], y_range[0], y_range[1]]) plt.pause(0.05) MED /= 2000 print('MED = %f\n' % MED)
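# --- Added note: the demo loop above accumulates MED, the mean Euclidean
# distance between predicted and true (x, y) positions, dividing by a
# hard-coded 2000 (the test-set size it assumes). An equivalent numpy sketch
# that divides by the actual number of points instead:
import numpy as np

def mean_euclidean_distance(pred, target):
    pred, target = np.asarray(pred), np.asarray(target)
    return np.sqrt(((pred - target) ** 2).sum(axis=1)).mean()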
def evaluate_lenet5(learning_rate=0.008, n_epochs=2000, nkerns=[400], batch_size=1, window_width=3, maxSentLength=30, emb_size=300, hidden_size=[300,10], margin=0.5, L2_weight=0.0001, Div_reg=0.0001, norm_threshold=5.0, use_svm=False): model_options = locals().copy() print "model options", model_options rootPath='/mounts/data/proj/wenpeng/Dataset/MicrosoftParaphrase/tokenized_msr/'; rng = numpy.random.RandomState(23455) datasets, word2id=load_msr_corpus_20161229(rootPath+'tokenized_train.txt', rootPath+'tokenized_test.txt', maxSentLength) vocab_size=len(word2id)+1 mtPath='/mounts/data/proj/wenpeng/Dataset/paraphraseMT/' mt_train, mt_test=load_mts(mtPath+'concate_15mt_train.txt', mtPath+'concate_15mt_test.txt') wm_train, wm_test=load_wmf_wikiQA(rootPath+'train_number_matching_scores.txt', rootPath+'test_number_matching_scores.txt') indices_train, trainY, trainLengths, normalized_train_length, trainLeftPad, trainRightPad= datasets[0] indices_train_l=indices_train[::2] indices_train_r=indices_train[1::2] trainLengths_l=trainLengths[::2] trainLengths_r=trainLengths[1::2] normalized_train_length_l=normalized_train_length[::2] normalized_train_length_r=normalized_train_length[1::2] trainLeftPad_l=trainLeftPad[::2] trainLeftPad_r=trainLeftPad[1::2] trainRightPad_l=trainRightPad[::2] trainRightPad_r=trainRightPad[1::2] indices_test, testY, testLengths,normalized_test_length, testLeftPad, testRightPad= datasets[1] indices_test_l=indices_test[::2] indices_test_r=indices_test[1::2] testLengths_l=testLengths[::2] testLengths_r=testLengths[1::2] normalized_test_length_l=normalized_test_length[::2] normalized_test_length_r=normalized_test_length[1::2] testLeftPad_l=testLeftPad[::2] testLeftPad_r=testLeftPad[1::2] testRightPad_l=testRightPad[::2] testRightPad_r=testRightPad[1::2] train_size = len(indices_train_l) test_size = len(indices_test_l) train_batch_start=range(train_size) test_batch_start=range(test_size) # indices_train_l=theano.shared(numpy.asarray(indices_train_l, dtype=theano.config.floatX), borrow=True) # indices_train_r=theano.shared(numpy.asarray(indices_train_r, dtype=theano.config.floatX), borrow=True) # indices_test_l=theano.shared(numpy.asarray(indices_test_l, dtype=theano.config.floatX), borrow=True) # indices_test_r=theano.shared(numpy.asarray(indices_test_r, dtype=theano.config.floatX), borrow=True) # indices_train_l=T.cast(indices_train_l, 'int32') # indices_train_r=T.cast(indices_train_r, 'int32') # indices_test_l=T.cast(indices_test_l, 'int32') # indices_test_r=T.cast(indices_test_r, 'int32') rand_values=random_value_normal((vocab_size, emb_size), theano.config.floatX, rng) # rand_values[0]=numpy.array(numpy.zeros(emb_size)) id2word = {y:x for x,y in word2id.iteritems()} word2vec=load_word2vec() rand_values=load_word2vec_to_init_new(rand_values, id2word, word2vec) embeddings=theano.shared(value=numpy.array(rand_values,dtype=theano.config.floatX), borrow=True)#theano.shared(value=rand_values, borrow=True) # allocate symbolic variables for the data # index = T.iscalar() x_index_l = T.imatrix() # now, x is the index matrix, must be integer x_index_r = T.imatrix() y = T.ivector() left_l=T.iscalar() right_l=T.iscalar() left_r=T.iscalar() right_r=T.iscalar() length_l=T.iscalar() length_r=T.iscalar() norm_length_l=T.fscalar() norm_length_r=T.fscalar() mts=T.fmatrix() wmf=T.fmatrix() # cost_tmp=T.fscalar() #x=embeddings[x_index.flatten()].reshape(((batch_size*4),maxSentLength, emb_size)).transpose(0, 2, 1).flatten() ishape = (emb_size, maxSentLength) # this is the size of MNIST images 
filter_size=(emb_size,window_width) #poolsize1=(1, ishape[1]-filter_size[1]+1) #????????????????????????????? length_after_wideConv=ishape[1]+filter_size[1]-1 ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer #layer0_input = x.reshape(((batch_size*4), 1, ishape[0], ishape[1])) layer0_l_input = embeddings[x_index_l.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) layer0_r_input = embeddings[x_index_r.flatten()].reshape((batch_size,maxSentLength, emb_size)).dimshuffle(0, 'x', 2,1) conv_W, conv_b=create_conv_para(rng, filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1])) conv_W_into_matrix=conv_W.reshape((conv_W.shape[0], conv_W.shape[2]*conv_W.shape[3])) #layer0_output = debug_print(layer0.output, 'layer0.output') layer0_l = Conv_with_input_para(rng, input=layer0_l_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_r = Conv_with_input_para(rng, input=layer0_r_input, image_shape=(batch_size, 1, ishape[0], ishape[1]), filter_shape=(nkerns[0], 1, filter_size[0], filter_size[1]), W=conv_W, b=conv_b) layer0_l_output=debug_print(layer0_l.output, 'layer0_l.output') layer0_r_output=debug_print(layer0_r.output, 'layer0_r.output') layer0_l_output_maxpool = T.max(layer0_l.output_narrow_conv_out[:,:,:,left_l:], axis=3).reshape((1, nkerns[0])) layer0_r_output_maxpool = T.max(layer0_r.output_narrow_conv_out[:,:,:,left_r:], axis=3).reshape((1, nkerns[0])) layer1=Average_Pooling_for_Top(rng, input_l=layer0_l_output, input_r=layer0_r_output, kern=nkerns[0], left_l=left_l, right_l=right_l, left_r=left_r, right_r=right_r, length_l=length_l+filter_size[1]-1, length_r=length_r+filter_size[1]-1, dim=maxSentLength+filter_size[1]-1) sum_uni_l=T.sum(layer0_l_input[:,:,:,left_l:], axis=3).reshape((1, emb_size)) norm_uni_l=sum_uni_l/T.sqrt((sum_uni_l**2).sum()) sum_uni_r=T.sum(layer0_r_input[:,:,:,left_r:], axis=3).reshape((1, emb_size)) norm_uni_r=sum_uni_r/T.sqrt((sum_uni_r**2).sum()) uni_cosine=cosine(sum_uni_l, sum_uni_r) ''' linear=Linear(sum_uni_l, sum_uni_r) poly=Poly(sum_uni_l, sum_uni_r) sigmoid=Sigmoid(sum_uni_l, sum_uni_r) rbf=RBF(sum_uni_l, sum_uni_r) gesd=GESD(sum_uni_l, sum_uni_r) ''' eucli_1=1.0/(1.0+EUCLID(sum_uni_l, sum_uni_r))#25.2% #eucli_1=EUCLID(sum_uni_l, sum_uni_r) len_l=norm_length_l.reshape((1,1)) len_r=norm_length_r.reshape((1,1)) ''' len_l=length_l.reshape((1,1)) len_r=length_r.reshape((1,1)) ''' #length_gap=T.log(1+(T.sqrt((len_l-len_r)**2))).reshape((1,1)) #length_gap=T.sqrt((len_l-len_r)**2) #layer3_input=mts HL_layer_1_input=T.concatenate([ # mts, eucli_1, #uni_cosine,norm_uni_l-(norm_uni_l+norm_uni_r)/2,#uni_cosine, # uni_cosine, # sum_uni_l, # sum_uni_r, # sum_uni_l+sum_uni_r, 1.0/(1.0+EUCLID(layer0_l_output_maxpool, layer0_r_output_maxpool)), cosine(layer0_l_output_maxpool, layer0_r_output_maxpool), layer0_l_output_maxpool, layer0_r_output_maxpool, T.sqrt((layer0_l_output_maxpool-layer0_r_output_maxpool)**2+1e-10), layer1.output_eucli_to_simi, #layer1.output_cosine,layer1.output_vector_l-(layer1.output_vector_l+layer1.output_vector_r)/2,#layer1.output_cosine, # layer1.output_cosine, layer1.output_vector_l, layer1.output_vector_r, T.sqrt((layer1.output_vector_l-layer1.output_vector_r)**2+1e-10), # len_l, len_r layer1.output_attentions # wmf, ], axis=1)#, layer2.output, 
layer1.output_cosine], axis=1) HL_layer_1_input_with_extra=T.concatenate([#HL_layer_1_input, mts, len_l, len_r # wmf ], axis=1)#, layer2.output, layer1.output_cosine], axis=1) HL_layer_1_input_size=1+1+ 1+1+3* nkerns[0] +1+1+3*nkerns[0]+10*10 HL_layer_1_input_with_extra_size = HL_layer_1_input_size+15+2 HL_layer_1=HiddenLayer(rng, input=HL_layer_1_input, n_in=HL_layer_1_input_size, n_out=hidden_size[0], activation=T.tanh) HL_layer_2=HiddenLayer(rng, input=HL_layer_1.output, n_in=hidden_size[0], n_out=hidden_size[1], activation=T.tanh) LR_layer_input=T.concatenate([HL_layer_2.output, HL_layer_1.output, HL_layer_1_input],axis=1) LR_layer_input_with_extra=T.concatenate([HL_layer_2.output, HL_layer_1_input_with_extra],axis=1)#HL_layer_1.output, LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=HL_layer_1_input_size+hidden_size[0]+hidden_size[1], n_out=2) # LR_layer_input=HL_layer_2.output # LR_layer=LogisticRegression(rng, input=LR_layer_input, n_in=hidden_size, n_out=2) # layer3=LogisticRegression(rng, input=layer3_input, n_in=15+1+1+2+3, n_out=2) #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum() L2_reg =debug_print((LR_layer.W** 2).sum()+(HL_layer_2.W** 2).sum()+(HL_layer_1.W** 2).sum()+(conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum() # diversify_reg= Diversify_Reg(LR_layer.W.T)+Diversify_Reg(HL_layer_2.W.T)+Diversify_Reg(HL_layer_1.W.T)+Diversify_Reg(conv_W_into_matrix) cost_this =debug_print(LR_layer.negative_log_likelihood(y), 'cost_this')#+L2_weight*L2_reg cost=cost_this+L2_weight*L2_reg#+Div_reg*diversify_reg test_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [LR_layer.errors(y), LR_layer.y_pred, LR_layer_input_with_extra, y], on_unused_input='ignore',allow_input_downcast=True) params = LR_layer.params+ HL_layer_2.params+HL_layer_1.params+[conv_W, conv_b]+[embeddings]#+[embeddings]# + layer1.params accumulator=[] for para_i in params: eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX) accumulator.append(theano.shared(eps_p, borrow=True)) # create a list of gradients for all model parameters grads = T.grad(cost, params) updates = [] for param_i, grad_i, acc_i in zip(params, grads, accumulator): clipped_grad = T.clip(grad_i, -0.5, 0.5) acc = acc_i + T.sqr(clipped_grad) updates.append((param_i, param_i - learning_rate * clipped_grad / T.sqrt(acc+1e-10))) #AdaGrad updates.append((acc_i, acc)) train_model = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost,LR_layer.errors(y)], updates=updates, on_unused_input='ignore',allow_input_downcast=True) train_model_predict = theano.function([x_index_l,x_index_r,y,left_l, right_l, left_r, right_r, length_l, length_r, norm_length_l, norm_length_r, mts,wmf], [cost_this,LR_layer.errors(y), LR_layer_input_with_extra, y],on_unused_input='ignore',allow_input_downcast=True) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 500000000000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is best_params = None best_validation_loss = numpy.inf test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False max_acc=0.0 nn_max_acc=0.0 best_iter=0 cost_tmp=0.0 while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #for minibatch_index in xrange(n_train_batches): # each batch minibatch_index=0 shuffle(train_batch_start)#shuffle training data for index in train_batch_start: # iter means how many batches have been runed, taking into loop iter = (epoch - 1) * train_size + minibatch_index +1 minibatch_index=minibatch_index+1 # if iter%update_freq != 0: # cost_ij, error_ij, layer3_input, y=train_model_predict(batch_start) # #print 'cost_ij: ', cost_ij # cost_tmp+=cost_ij # error_sum+=error_ij # else: cost_i, error_i= train_model(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) cost_tmp+=cost_i if iter < 6000 and iter %100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) if iter >= 6000 and iter % 100 == 0: # if iter%100 ==0: print 'training @ iter = '+str(iter)+' average cost: '+str(cost_tmp/iter) test_losses=[] test_y=[] test_features=[] for index in test_batch_start: test_loss, pred_y, layer3_input, y=test_model(indices_test_l[index: index + batch_size], indices_test_r[index: index + batch_size], testY[index: index + batch_size], testLeftPad_l[index], testRightPad_l[index], testLeftPad_r[index], testRightPad_r[index], testLengths_l[index], testLengths_r[index], normalized_test_length_l[index], normalized_test_length_r[index], mt_test[index: index + batch_size], wm_test[index: index + batch_size]) #test_losses = [test_model(i) for i in test_batch_start] test_losses.append(test_loss) test_y.append(y[0]) test_features.append(layer3_input[0]) #write_file.write(str(pred_y[0])+'\n')#+'\t'+str(testY[i].eval())+ #write_file.close() test_score = numpy.mean(test_losses) test_acc = (1-test_score) * 100. 
if test_acc > nn_max_acc: nn_max_acc = test_acc print '\t\t\tepoch:', epoch, 'iter:', iter, 'current acc:', test_acc, 'nn_max_acc:', nn_max_acc #now, see the results of svm if use_svm: train_y=[] train_features=[] for index in train_batch_start: cost_ij, error_ij, layer3_input, y=train_model_predict(indices_train_l[index: index + batch_size], indices_train_r[index: index + batch_size], trainY[index: index + batch_size], trainLeftPad_l[index], trainRightPad_l[index], trainLeftPad_r[index], trainRightPad_r[index], trainLengths_l[index], trainLengths_r[index], normalized_train_length_l[index], normalized_train_length_r[index], mt_train[index: index + batch_size], wm_train[index: index + batch_size]) train_y.append(y[0]) train_features.append(layer3_input[0]) #write_feature.write(' '.join(map(str,layer3_input[0]))+'\n') #write_feature.close() clf = svm.SVC(kernel='linear')#OneVsRestClassifier(LinearSVC()) #linear 76.11%, poly 75.19, sigmoid 66.50, rbf 73.33 clf.fit(train_features, train_y) results=clf.predict(test_features) lr=LinearRegression().fit(train_features, train_y) results_lr=lr.predict(test_features) corr_count=0 corr_lr=0 test_size=len(test_y) for i in range(test_size): if results[i]==test_y[i]: corr_count+=1 if numpy.absolute(results_lr[i]-test_y[i])<0.5: corr_lr+=1 acc=corr_count*1.0/test_size acc_lr=corr_lr*1.0/test_size if acc > max_acc: max_acc=acc best_iter=iter if acc_lr> max_acc: max_acc=acc_lr best_iter=iter print '\t\t\t\tsvm acc: ', acc, 'LR acc: ', acc_lr, ' max acc: ', max_acc , ' at iter: ', best_iter if patience <= iter: done_looping = True break end_time = time.clock() print('Optimization complete.') print('Best validation score of %f %% obtained at iteration %i,'\ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.))
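# --- Added note: the updates built above are AdaGrad with gradient clipping:
# each gradient is clipped to [-0.5, 0.5], its square accumulated, and the
# step scaled by 1/sqrt(accumulator). A per-parameter numpy sketch:
import numpy as np

def adagrad_step(param, grad, acc, lr=0.008, clip=0.5, eps=1e-10):
    g = np.clip(grad, -clip, clip)
    acc = acc + g ** 2                          # running sum of squared grads
    param = param - lr * g / np.sqrt(acc + eps)
    return param, acc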
def classify_lenet5( learning_rate=0.005, n_epochs=8000, image_path='D:/dev/datasets/isbi/train-input/train-input_0000.tif', paramfile='lenet0_membrane_epoch_25100.pkl.gz', nkerns=[20, 50], batch_size=1): rng = numpy.random.RandomState(23455) # allocate symbolic variables for the data index_x = T.lscalar() # index to a [mini]batch index_y = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ishape = (28, 28) # this is the size of MNIST images ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, 28, 28)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 12, 12), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # the TanhLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # construct a fully-connected sigmoidal layer layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 4 * 4, n_out=500, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=2) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y)
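# --- Added note: negative_log_likelihood above is the mean NLL of the true
# labels under the softmax output. A numpy sketch of the same quantity:
import numpy as np

def negative_log_likelihood(p_y_given_x, y):
    # p_y_given_x: (batch, n_classes) softmax probabilities; y: int labels
    return -np.mean(np.log(p_y_given_x[np.arange(len(y)), y]))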
def __init__(self, rng, batch_size=200, nkerns=[35, 70, 35], gamma=1e-6): x = T.matrix() y = T.matrix() learning_rate = T.scalar() print '... building the model' self.batch_size = batch_size self.lowestError = 1. # Reshape matrix of rasterized images of shape (batch_size, 28 * 28) # to a 4D tensor, compatible with our LeNetConvPoolLayer # (28, 28) is the size of MNIST images. layer0_input = x.reshape((batch_size, 1, 48, 48)) # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24) # maxpooling reduces this further to (24/2, 24/2) = (12, 12) # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, 48, 48), filter_shape=(nkerns[0], 1, 9, 9), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, nkerns[0], 20, 20), filter_shape=(nkerns[1], nkerns[0], 5, 5), poolsize=(2, 2)) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8) # maxpooling reduces this further to (8/2, 8/2) = (4, 4) # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4) layer2 = LeNetConvPoolLayer(rng, input=layer1.output, image_shape=(batch_size, nkerns[1], 8, 8), filter_shape=(nkerns[2], nkerns[1], 5, 5), poolsize=(2, 2)) #layer2_param, layer2_out = layer2.params, layer2.output #img_dim = ( img_dim - dim_filter[2] + 1 )/2 layer3_input = layer2.output.flatten(2) layer3 = HiddenLayer(rng, input=layer3_input, n_in=nkerns[2] * 2 * 2, n_out=100, activation=T.tanh) ''' layer4 = LogisticRegression( input=layer3.output, n_in = num_hidden[0], n_out = num_hidden[1] ) params = layer0.params + layer1.params + layer2.params + layer3.params + layer4.params # using L2 regularization #L2_reg = sum([T.sum(i**2) for i in params if 'W' in i.name]) cost = layer4.negative_log_likelihood(np.argmax(y)) #cost += gamma * L2_reg ''' layer4_input = layer3.output.flatten(2) layer4 = HiddenLayer(rng, input=layer4_input, n_in=100, n_out=10, activation=T.nnet.softmax) params = layer0.params + layer1.params + layer2.params + layer3.params + layer4.params # using L2 regularization L2_reg = sum([T.sum(i**2) for i in params if i.name == 'W']) cost = T.sum((y - layer4.output)**2) / y.shape[0] cost += gamma * L2_reg grads = T.grad(cost, params) updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] self.train_model = theano.function([x, y, learning_rate], cost, updates=updates) self.eval_net = theano.function([x], layer4.output) self.params = params
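# --- Added note: the cost above is a batch-mean squared error plus an L2
# penalty gathered from every parameter named 'W'. A numpy sketch:
import numpy as np

def mse_with_l2(pred, target, weight_mats, gamma=1e-6):
    mse = ((target - pred) ** 2).sum() / float(target.shape[0])
    return mse + gamma * sum((W ** 2).sum() for W in weight_mats)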
def __init__(self, N_tot, D_in, D_out, M, Domain_number, Ydim,
             Hiddenlayerdim1, Hiddenlayerdim2, num_MC):
    ########################################
    # set type
    self.Xlabel = T.matrix('Xlabel')
    self.X = T.matrix('X')
    self.Y = T.matrix('Y')
    self.Weight = T.matrix('Weight')
    Ydim = self.Y.shape[1]
    N = self.X.shape[0]
    self.Ntot = N_tot
    #############################################
    # Back-constrained setup for X (turn this into a layer later as well);
    # we generate num_MC Monte Carlo samples.
    self.hiddenLayer_x = HiddenLayer(rng=rng, input=self.X, n_in=D_in,
                                     n_out=Hiddenlayerdim1,
                                     activation=T.nnet.relu, number='_x')
    self.hiddenLayer_hidden = HiddenLayer(rng=rng,
                                          input=self.hiddenLayer_x.output,
                                          n_in=Hiddenlayerdim1,
                                          n_out=Hiddenlayerdim2,
                                          activation=T.nnet.relu, number='_h')
    self.hiddenLayer_m = HiddenLayer(rng=rng,
                                     input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=D_out,
                                     activation=T.nnet.relu, number='_m')
    self.hiddenLayer_S = HiddenLayer(rng=rng,
                                     input=self.hiddenLayer_hidden.output,
                                     n_in=Hiddenlayerdim2, n_out=D_out,
                                     activation=T.nnet.relu, number='_S')
    self.loc_params = []
    self.loc_params.extend(self.hiddenLayer_x.params)
    self.loc_params.extend(self.hiddenLayer_hidden.params)
    self.loc_params.extend(self.hiddenLayer_m.params)
    self.loc_params.extend(self.hiddenLayer_S.params)
    self.local_params = {}
    for i in self.loc_params:
        self.local_params[str(i)] = i
    # when we use the back constrained model....
    srng = RandomStreams(seed=234)
    sample_latent_epsilon = srng.normal((num_MC, N, D_out))
    latent_samples = sample_latent_epsilon * \
        (T.exp(self.hiddenLayer_S.output)**0.5)[None, :, :] + \
        self.hiddenLayer_m.output[None, :, :]
    # In the ordinary supervised case we would just copy the input num_MC times:
    # self.Data_input = T.tile(self.X, (num_MC, 1, 1))
    self.Data_input = latent_samples
    ##########################################
    #### Inference on the X side
    # self.Gaussian_layer_X = KernelLayer(self.Data_input, D_in=D_out, D_out=D_in, num_MC=num_MC, inducing_number=M, Domain_number=None, Domain_consideration=False, number='_X')
    self.Gaussian_layer_X = KernelLayer(self.Data_input,
                                        D_in=D_out,
                                        D_out=D_in,
                                        num_MC=num_MC,
                                        inducing_number=M,
                                        Domain_number=Domain_number,
                                        Domain_consideration=True,
                                        number='_X')
    self.params = self.Gaussian_layer_X.params
    self.Z_params_list = self.Gaussian_layer_X.Z_params_list
    self.global_param_list = self.Gaussian_layer_X.global_params_list
    self.hyp_list = self.Gaussian_layer_X.hyp_params_list
    self.hidden_layer = self.Gaussian_layer_X.output
    ##############################################################################################
    ### Computation on the Y side
    # self.Gaussian_layer_Y = KernelLayer(self.hidden_layer, D_in=D_out, D_out=Ydim, num_MC=num_MC, inducing_number=M, Domain_number=None, Domain_consideration=False, number='_Y')
    # self.params.extend(self.Gaussian_layer_Y.params)
    # self.Z_params_list.extend(self.Gaussian_layer_Y.Z_params_list)
    # self.global_param_list.extend(self.Gaussian_layer_Y.global_params_list)
    # self.hyp_list.extend(self.Gaussian_layer_Y.hyp_params_list)
    ###########################################
    ### Objective function
    # self.LL = self.Gaussian_layer_X.liklihood_nodomain(self.X)*N_tot/(N)
    self.LL = self.Gaussian_layer_X.likelihood_domain(
        self.X, self.Xlabel) * N_tot / (N)
    self.KL_U = self.Gaussian_layer_X.KL_U
    # self.KL_UY = self.Gaussian_layer_Y.KL_U
    # y = self.Gaussian_layer_Y.softmax_class()
    # self.LLY = -T.mean(T.nnet.categorical_crossentropy(y, self.Y))*N
    # self.LLY = T.sum(T.log(T.maximum(T.sum(self.Y * y, 1), 1e-16)))
    # self.error = self.Gaussian_layer_Y.error_classification(self.Y)
    self.KL_latent_dim = self.KLD_X(
        self.hiddenLayer_m.output,
        T.exp(self.hiddenLayer_S.output)) * N_tot / (N)
    # pred = T.mean(self.Gaussian_layer_X.output, 0)
    # self.error = (T.mean((self.Y - pred)**2, 0))**0.5
    ###########################################
    # domain checker: MMD and classification
    # self.MMD = self.Gaussian_layer_Y.MMD_class_penalty(self.Y, self.Xlabel)
    ##########################################
    # store the parameters
    self.hyp_params = {}
    for i in self.hyp_list:
        self.hyp_params[str(i)] = i
    self.Z_params = {}
    for i in self.Z_params_list:
        self.Z_params[str(i)] = i
    self.global_params = {}
    for i in self.global_param_list:
        self.global_params[str(i)] = i
    self.params.extend(self.loc_params)
    self.wrt = {}
    for i in self.params:
        self.wrt[str(i)] = i
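# --- Added note: KLD_X itself is not shown here; assuming it is the usual
# closed-form KL divergence between the diagonal Gaussian N(m, diag(S)) and
# the standard normal (as the mean / log-variance parameterization above
# suggests), a numpy sketch would be:
import numpy as np

def kl_diag_gaussian_vs_standard(mean, var):
    # per-sample KL( N(mean, diag(var)) || N(0, I) )
    return 0.5 * np.sum(var + mean ** 2 - 1.0 - np.log(var), axis=1)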
def __init__(self, rng, batch_size=100, input_size=None, nkerns=[4, 4, 4], receptive_fields=((2, 8), (2, 8), (2, 8)), poolsizes=((1, 8), (1, 8), (1, 4)), full_hidden=16, n_out=10): """ """ self.x = T.matrix(name='x', dtype=theano.config.floatX ) # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector of self.batch_size = theano.shared( value=batch_size, name='batch_size') #T.lscalar('batch_size') self.layers = [] self.params = [] for i in range(len(nkerns)): receptive_field = receptive_fields[i] if i == 0: featmap_size_after_downsample = input_size layeri_input = self.x.reshape( (batch_size, 1, featmap_size_after_downsample[0], featmap_size_after_downsample[1])) image_shape = (batch_size, 1, featmap_size_after_downsample[0], featmap_size_after_downsample[1]) filter_shape = (nkerns[i], 1, receptive_field[0], receptive_field[1]) else: layeri_input = self.layers[i - 1].output image_shape = (batch_size, nkerns[i - 1], featmap_size_after_downsample[0], featmap_size_after_downsample[1]) filter_shape = (nkerns[i], nkerns[i - 1], receptive_field[0], receptive_field[1]) layeri = LeNetConvPoolLayer(rng=rng, input=layeri_input, image_shape=image_shape, filter_shape=filter_shape, poolsize=poolsizes[i]) featmap_size_after_conv = get_featmap_size_after_conv( featmap_size_after_downsample, receptive_fields[i]) featmap_size_after_downsample = get_featmap_size_after_downsample( featmap_size_after_conv, poolsizes[i]) self.layers.append(layeri) self.params.extend(layeri.params) # fully connected layer print 'going to fully connected layer' layer_full_input = self.layers[-1].output.flatten(2) # construct a fully-connected sigmoidal layer layer_full = HiddenLayer(rng=rng, input=layer_full_input, n_in=nkerns[-1] * featmap_size_after_downsample[0] * featmap_size_after_downsample[1], n_out=full_hidden, activation=T.tanh) self.layers.append(layer_full) self.params.extend(layer_full.params) # classify the values of the fully-connected sigmoidal layer print 'going to output layer' self.logRegressionLayer = LogisticRegression( input=self.layers[-1].output, n_in=full_hidden, n_out=n_out) self.params.extend(self.logRegressionLayer.params) # the cost we minimize during training is the NLL of the model self.negative_log_likelihood = self.logRegressionLayer.negative_log_likelihood( self.y) self.cost = self.logRegressionLayer.negative_log_likelihood(self.y) self.errors = self.logRegressionLayer.errors(self.y) self.y_pred = self.logRegressionLayer.y_pred
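# --- Added note: get_featmap_size_after_conv / get_featmap_size_after_downsample
# are not shown in this excerpt; assuming valid convolution and non-overlapping
# pooling, they would reduce to this arithmetic (a sketch of the assumed behaviour):
def featmap_size_after_conv(size, receptive_field):
    # valid convolution: output = input - filter + 1 per dimension
    return (size[0] - receptive_field[0] + 1, size[1] - receptive_field[1] + 1)

def featmap_size_after_downsample(size, poolsize):
    # non-overlapping max pooling: integer division by the pool size
    return (size[0] // poolsize[0], size[1] // poolsize[1])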
def evaluate_lenet5(learning_rate=0.10, n_epochs=200, dataset='mnist.pkl.gz', nkerns=[16, 16, 16, 12, 12, 12], batch_size=500): rng = numpy.random.RandomState(32324) datasets = load_data(dataset) train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size index = T.lscalar() # index for each mini batch train_epoch = T.lscalar('train_epoch') x = T.matrix('x') y = T.ivector('y') # ------------------------------- Building Model ---------------------------------- print "...Building the model" layer_0_input = x.reshape((batch_size, 1, 28, 28)) # output image size = (28-5+1+)/1 = 24 layer_0 = LeNetConvPoolLayer(rng, input=layer_0_input, image_shape=(batch_size, 1, 28, 28), filter_shape=(nkerns[0], 1, 5, 5), poolsize=(1, 1)) #output image size = (24-3+1) = 22 layer_1 = LeNetConvPoolLayer(rng, input=layer_0.output, image_shape=(batch_size, nkerns[0], 24, 24), filter_shape=(nkerns[1], nkerns[0], 3, 3), poolsize=(1, 1)) #output image size = (22-3+1)/2 = 10 layer_2 = LeNetConvPoolLayer(rng, input=layer_1.output, image_shape=(batch_size, nkerns[1], 22, 22), filter_shape=(nkerns[2], nkerns[1], 3, 3), poolsize=(2, 2)) #output image size = (10-3+1)/2 = 4 layer_3 = LeNetConvPoolLayer(rng, input=layer_2.output, image_shape=(batch_size, nkerns[2], 10, 10), filter_shape=(nkerns[3], nkerns[2], 3, 3), poolsize=(2, 2)) #output image size = (4-3+2+1) = 4 layer_4 = LeNetConvPoolLayer(rng, input=layer_3.output, image_shape=(batch_size, nkerns[3], 4, 4), filter_shape=(nkerns[4], nkerns[3], 3, 3), poolsize=(1, 1), border_mode=1) #output image size = (4-3+1)/2 = 2 layer_5 = LeNetConvPoolLayer(rng, input=layer_4.output, image_shape=(batch_size, nkerns[4], 4, 4), filter_shape=(nkerns[5], nkerns[4], 3, 3), poolsize=(2, 2), border_mode=1) # make the input to hidden layer 2 dimensional layer_6_input = layer_5.output.flatten(2) layer_6 = HiddenLayer(rng, input=layer_6_input, n_in=nkerns[5] * 2 * 2, n_out=200, activation=T.tanh) layer_7 = LogReg(input=layer_6.output, n_in=200, n_out=10) teacher_p_y_given_x = theano.shared(numpy.asarray( pickle.load(open('prob_best_model.pkl', 'rb')), dtype=theano.config.floatX), borrow=True) p_y_given_x = T.matrix('p_y_given_x') e = theano.shared(value=0, name='e', borrow=True) cost = layer_7.neg_log_likelihood( y) + 2.0 / (e) * T.mean(-T.log(layer_7.p_y_given_x) * p_y_given_x - layer_7.p_y_given_x * T.log(p_y_given_x)) tg = theano.shared(numpy.asarray(pickle.load( open('modified_guided_data.pkl', 'rb')), dtype=theano.config.floatX), borrow=True) guiding_weights = T.tensor4('guiding_weights') #guide_cost = T.mean(-T.log(layer_3.output)*guiding_weights - layer_3.output*T.log(guiding_weights)) guide_cost = T.mean((layer_3.output - guiding_weights)**2) test_model = theano.function( [index], layer_7.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) validate_model = theano.function( [index], layer_7.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # list of parameters params = layer_7.params + layer_6.params + layer_5.params + layer_4.params + layer_3.params + layer_2.params + layer_1.params + layer_0.params params_gl = layer_3.params + 
layer_2.params + layer_1.params + layer_0.params # import pdb # pdb.set_trace() grads_gl = T.grad(guide_cost, params_gl) updates_gl = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params_gl, grads_gl)] grads = T.grad(cost, params) updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] train_model = theano.function( [index, train_epoch], cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size], p_y_given_x: teacher_p_y_given_x[index], e: train_epoch }) train_till_guided_layer = theano.function( [index], guide_cost, updates=updates_gl, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size], guiding_weights: tg[index] }, on_unused_input='ignore') # -----------------------------------------Starting Training ------------------------------ print('..... Training ') # for early stopping patience = 10000 patience_increase = 2 improvement_threshold = 0.95 validation_frequency = min(n_train_batches, patience // 2) best_validation_loss = numpy.inf # initialising loss to be inifinite best_itr = 0 test_score = 0 start_time = timeit.default_timer() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 for minibatch_index in range(n_train_batches): iter = (epoch - 1) * n_train_batches + minibatch_index if iter % 100 == 0: print('training @ iter = ', iter) if epoch < n_epochs / 5: cost_ij_guided = train_till_guided_layer(minibatch_index) cost_ij = train_model(minibatch_index, epoch) if (iter + 1) % validation_frequency == 0: # compute loss on validation set validation_losses = [ validate_model(i) for i in range(n_valid_batches) ] this_validation_loss = numpy.mean(validation_losses) # import pdb # pdb.set_trace() with open('Student_6_terminal_out_2', 'a+') as f_: f_.write( 'epoch %i, minibatch %i/%i, validation error %f %% \n' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # check with best validation score till now if this_validation_loss < best_validation_loss: # improve if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_itr = iter test_losses = [ test_model(i) for i in range(n_test_batches) ] test_score = numpy.mean(test_losses) with open('Student_6_terminal_out_2', 'a+') as f_: f_.write( 'epoch %i, minibatch %i/%i, testing error %f %%\n' % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.)) with open('best_model_7layer_2.pkl', 'wb') as f: pickle.dump(params, f) with open('Results_student_6_2.txt', 'wb') as f1: f1.write(str(test_score * 100) + '\n') #if patience <= iter: # done_looping = True # break end_time = timeit.default_timer() with open('Student_6_terminal_out_2', 'a+') as f_: f_.write('Optimization complete\n') f_.write( 'Best validation score of %f %% obtained at iteration %i with test performance %f %% \n' % (best_validation_loss * 100., best_itr, test_score * 100)) f_.write('The code ran for %.2fm\n' % ((end_time - start_time) / 60.))
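# --- Added note: the cost above is knowledge distillation: the student's NLL
# on hard labels plus a teacher term whose weight 2.0/epoch decays over
# training. A numpy sketch of the same combination (a sketch, not the exact
# Theano graph built above):
import numpy as np

def distillation_cost(student_p, teacher_p, y, epoch, eps=1e-12):
    nll = -np.mean(np.log(student_p[np.arange(len(y)), y] + eps))
    teacher_term = np.mean(-np.log(student_p + eps) * teacher_p
                           - student_p * np.log(teacher_p + eps))
    return nll + (2.0 / epoch) * teacher_term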
#     image_shape=(batch_size, nkerns[0], 9, 9),
#     filter_shape=(nkerns[1], nkerns[0], 4, 4),
#     poolsize=(2, 2)
# )

# the HiddenLayer being fully-connected, it operates on 2D matrices of
# shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
# This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
# or (500, 50 * 4 * 4) = (500, 800) with the default values.
layer2_input = layer0.output.flatten(2)

# construct a fully-connected sigmoidal layer
layer2 = HiddenLayer(
    rng,
    input=layer2_input,
    n_in=nkerns[0] * 2 * 2,
    n_out=50,
    activation=T.tanh
)

# classify the values of the fully-connected sigmoidal layer
layer3 = LogisticRegression(input=layer2.output, n_in=50, n_out=10)

# the cost we minimize during training is the NLL of the model
cost = layer3.negative_log_likelihood(y)

# create a function to compute the mistakes that are made by the model
test_model = theano.function(
    [index],
    layer3.errors(y),
    givens={
def __init__(self, numpy_rng, theano_rng=None, n_ins=784, hidden_layers_sizes=[500, 500], n_outs=10): """This class is made to support a variable number of layers. :type numpy_rng: numpy.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_ins: int :param n_ins: dimension of the input to the DBN :type hidden_layers_sizes: list of ints :param hidden_layers_sizes: intermediate layers size, must contain at least one value :type n_outs: int :param n_outs: dimension of the output of the network """ self.sigmoid_layers = [] self.rbm_layers = [] self.params = [] self.n_layers = len(hidden_layers_sizes) assert self.n_layers > 0 if not theano_rng: theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.y = T.ivector('y') # the labels are presented as 1D vector # of [int] labels # end-snippet-1 # The DBN is an MLP, for which all weights of intermediate # layers are shared with a different RBM. We will first # construct the DBN as a deep multilayer perceptron, and when # constructing each sigmoidal layer we also construct an RBM # that shares weights with that layer. During pretraining we # will train these RBMs (which will lead to chainging the # weights of the MLP as well) During finetuning we will finish # training the DBN by doing stochastic gradient descent on the # MLP. for i in xrange(self.n_layers): # construct the sigmoidal layer # the size of the input is either the number of hidden # units of the layer below or the input size if we are on # the first layer if i == 0: input_size = n_ins else: input_size = hidden_layers_sizes[i - 1] # the input to this layer is either the activation of the # hidden layer below or the input of the DBN if you are on # the first layer if i == 0: layer_input = self.x else: layer_input = self.sigmoid_layers[-1].output sigmoid_layer = HiddenLayer(rng=numpy_rng, input=layer_input, n_in=input_size, n_out=hidden_layers_sizes[i], activation=T.nnet.sigmoid) # add the layer to our list of layers self.sigmoid_layers.append(sigmoid_layer) # its arguably a philosophical question... but we are # going to only declare that the parameters of the # sigmoid_layers are parameters of the DBN. The visible # biases in the RBM are parameters of those RBMs, but not # of the DBN. self.params.extend(sigmoid_layer.params) # Construct an RBM that shared weights with this layer rbm_layer = RBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=layer_input, n_visible=input_size, n_hidden=hidden_layers_sizes[i], W=sigmoid_layer.W, hbias=sigmoid_layer.b) self.rbm_layers.append(rbm_layer) # We now need to add a logistic layer on top of the MLP self.logLayer = LogisticRegression( input=self.sigmoid_layers[-1].output, n_in=hidden_layers_sizes[-1], n_out=n_outs) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
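# A sketch of how the RBMs built above are typically pretrained, following
# the deeplearning.net DBN tutorial this class mirrors. It is meant as a
# method of the class and assumes each RBM exposes
# get_cost_updates(learning_rate, persistent, k) as in that tutorial; if the
# local RBM class differs, adapt the call accordingly.
def pretraining_functions(self, train_set_x, batch_size, k=1):
    index = T.lscalar('index')      # index to a minibatch
    learning_rate = T.scalar('lr')  # learning rate to use
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    pretrain_fns = []
    for rbm in self.rbm_layers:
        # one step of CD-k on this layer's RBM
        cost, updates = rbm.get_cost_updates(learning_rate,
                                             persistent=None, k=k)
        fn = theano.function(
            inputs=[index, theano.In(learning_rate, value=0.1)],
            outputs=cost,
            updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]})
        pretrain_fns.append(fn)
    return pretrain_fns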
    # filtering reduces the image size to (12-5+1,12-5+1)=(8,8)
    # maxpooling reduces this further to (8/2,8/2) = (4,4)
    # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0],
                                             layer1_input_img_size[0],
                                             layer1_input_img_size[1]),
                                filter_shape=(nkerns[1], nkerns[0],
                                              filter1_shape[0],
                                              filter1_shape[1]),
                                poolsize=(2, 2))

    # the TanhLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (20, 32*4*4) = (20, 512)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[1] * layer2_input_img_size[0] *
                              layer2_input_img_size[1],
                         n_out=layer2_out, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=layer2_out,
                                n_out=N_OUT)


def load_trained_model():
    global train_model_route
    global layer0_input
    global layer0
    global layer1
    global layer2_input
    global layer2
    global layer3
def __init__(self, PV, numpy_rng, kind=2, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], h_activation=[], n_outs=10,
             corruption_levels=[0.1, 0.1]):
    """This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                      weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the sdA

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type corruption_levels: list of float
    :param corruption_levels: amount of corruption to use for each layer
    """
    self.PV = theano.shared(value=PV, borrow=True)
    self.sigmoid_layers = []
    self.dA_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as a 1D vector of
                             # [int] labels
    self.z1 = T.matrix('z1')
    self.z2 = T.matrix('z2')
    # end-snippet-1

    # The SdA is an MLP, for which all weights of intermediate layers
    # are shared with a different denoising autoencoder.
    # We will first construct the SdA as a deep multilayer perceptron,
    # and when constructing each sigmoidal layer we also construct a
    # denoising autoencoder that shares weights with that layer.
    # During pretraining we will train these autoencoders (which will
    # lead to changing the weights of the MLP as well).
    # During finetuning we will finish training the SdA by doing
    # stochastic gradient descent on the MLP.
    # start-snippet-2
    for i in xrange(self.n_layers):
        # construct the sigmoidal layer

        # the size of the input is either the number of hidden units of
        # the layer below or the input size if we are on the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        # the input to this layer is either the activation of the hidden
        # layer below or the input of the SdA if you are on the first
        # layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        activation = None
        if h_activation[i] == 1:
            activation = T.nnet.sigmoid
        if h_activation[i] == 2:
            activation = T.tanh

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=activation)
        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)

        # it's arguably a philosophical question... but we are going to
        # only declare that the parameters of the sigmoid_layers are
        # parameters of the StackedDAA; the visible biases in the dA are
        # parameters of those dA, but not of the SdA
        self.params.extend(sigmoid_layer.params)

        # Construct a denoising autoencoder that shares weights with
        # this layer
        dA_layer = dA(numpy_rng=numpy_rng,
                      theano_rng=theano_rng,
                      input=layer_input,
                      n_visible=input_size,
                      n_hidden=hidden_layers_sizes[i],
                      W=sigmoid_layer.W,
                      bhid=sigmoid_layer.b)
        self.dA_layers.append(dA_layer)
    # end-snippet-2

    # We now need to add an output layer on top of the MLP
    self.OutLayer = HiddenLayer(rng=numpy_rng,
                                input=self.sigmoid_layers[-1].output,
                                n_in=hidden_layers_sizes[-1],
                                n_out=n_outs,
                                activation=T.nnet.sigmoid,
                                kind=kind)
    self.params.extend(self.OutLayer.params)

    # construct a function that implements one step of finetuning:
    # compute the cost for the second phase of training, here defined
    # as a squared loss between the z1/z2 targets
    self.finetune_cost = self.OutLayer.sq_loss(self.z1, self.z2)

    # symbolic variable that points to the number of errors made on the
    # minibatch given by self.x and self.y
    self.errors = self.OutLayer.errors(self.y)
    self.p_y_given_x = self.OutLayer.output
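# The distinctive piece of a denoising autoencoder is the corruption step
# applied before encoding. A minimal sketch in the style of the tutorial dA
# class this stack builds on, assuming a RandomStreams instance `theano_rng`
# like the one created above: zero out a random fraction of the input units.
def get_corrupted_input(theano_rng, input, corruption_level):
    # keep each input unit with probability 1 - corruption_level,
    # set the rest to zero
    return theano_rng.binomial(size=input.shape, n=1,
                               p=1 - corruption_level,
                               dtype=theano.config.floatX) * input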
def load_trained_model():
    global train_model_route
    global layer0_input
    global layer0
    global layer1
    global layer2_input
    global layer2
    global layer3
    global layer0_input_img_size  # ishape
    global filter0_shape
    global layer1_input_img_size
    global filter1_shape
    global layer2_input_img_size

    print "loading trained model"
    trained_model_pkl = open(train_model_route, 'r')
    trained_model_state_list = cPickle.load(trained_model_pkl)
    trained_model_state_array = numpy.load(trained_model_pkl)
    layer0_state, layer1_state, layer2_state, layer3_state = \
        trained_model_state_array

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... loading the model'

    # Reshape matrix of rasterized images of shape (batch_size, 50*50)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer
    layer0_input = x.reshape(
        (batch_size, 1, layer0_input_img_size[0], layer0_input_img_size[1]))

    # Construct the first convolutional pooling layer, initialised with
    # the saved weights; filtering and maxpooling shrink the image
    # exactly as during training
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1,
                                             layer0_input_img_size[0],
                                             layer0_input_img_size[1]),
                                filter_shape=(nkerns[0], 1,
                                              filter0_shape[0],
                                              filter0_shape[1]),
                                poolsize=(2, 2),
                                W=layer0_state[0], b=layer0_state[1])

    # Construct the second convolutional pooling layer
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0],
                                             layer1_input_img_size[0],
                                             layer1_input_img_size[1]),
                                filter_shape=(nkerns[1], nkerns[0],
                                              filter1_shape[0],
                                              filter1_shape[1]),
                                poolsize=(2, 2),
                                W=layer1_state[0], b=layer1_state[1])

    # the TanhLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images)
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input,
                         n_in=nkerns[1] * layer2_input_img_size[0] *
                              layer2_input_img_size[1],
                         n_out=layer2_out, activation=T.tanh,
                         W=layer2_state[0], b=layer2_state[1])

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=layer2_out,
                                n_out=N_OUT,
                                W=layer3_state[0], b=layer3_state[1])
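# For load_trained_model() to work, the training script must have saved the
# four layers' parameters in the same order. One plausible saving
# counterpart, sketched under the assumption of a single pickled nested
# list (the original file evidently stores a pickled object followed by a
# numpy array, so adapt to the actual on-disk layout); get_value() pulls
# the numpy arrays out of the Theano shared variables:
import cPickle

def save_trained_model(route, layer0, layer1, layer2, layer3):
    state = [[layer.W.get_value(), layer.b.get_value()]
             for layer in (layer0, layer1, layer2, layer3)]
    with open(route, 'wb') as f:
        cPickle.dump(state, f, protocol=cPickle.HIGHEST_PROTOCOL)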
def __init__(self, numpy_rng=None, theano_rng=None, n_ins=784, gauss=True,
             hidden_layers_sizes=[400], n_outs=40, W_list=None, b_list=None):
    """This class is made to support a variable number of layers.

    :type numpy_rng: numpy.random.RandomState
    :param numpy_rng: numpy random number generator used to draw initial
                      weights

    :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams
    :param theano_rng: Theano random generator; if None is given one is
                       generated based on a seed drawn from `rng`

    :type n_ins: int
    :param n_ins: dimension of the input to the DBN

    :type gauss: bool
    :param gauss: True if the first layer is Gaussian, otherwise the
                  first layer is Bernoullian

    :type hidden_layers_sizes: list of ints
    :param hidden_layers_sizes: intermediate layers size, must contain
                                at least one value

    :type n_outs: int
    :param n_outs: dimension of the output of the network

    :type W_list: list of numpy.ndarray
    :param W_list: the list of weight matrices for each layer of the MLP;
                   if None each matrix is randomly initialized

    :type b_list: list of numpy.ndarray
    :param b_list: the list of bias vectors for each layer of the MLP;
                   if None each vector is randomly initialized
    """
    self.n_ins = n_ins
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.stacked_layers_sizes = hidden_layers_sizes + [n_outs]
    self.n_layers = len(self.stacked_layers_sizes)

    assert self.n_layers > 0

    if numpy_rng is None:
        numpy_rng = numpy.random.RandomState(123)
    if theano_rng is None:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data;
    # the data is presented as rasterized images
    self.x = tensor.matrix('x')

    # The DBN is an MLP, for which all weights of intermediate
    # layers are shared with a different RBM. We will first
    # construct the DBN as a deep multilayer perceptron, and when
    # constructing each sigmoidal layer we also construct an RBM
    # that shares weights with that layer. During pretraining we
    # will train these RBMs (which will lead to changing the
    # weights of the MLP as well).
    for i in range(self.n_layers):
        # construct the sigmoidal layer

        # the size of the input is either the number of hidden
        # units of the layer below or the input size if we are on
        # the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = self.stacked_layers_sizes[i - 1]

        # the input to this layer is either the activation of the
        # hidden layer below or the input of the DBN if you are on
        # the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        n_in = input_size
        n_out = self.stacked_layers_sizes[i]
        print('Adding a layer with %i inputs and %i outputs' % (n_in, n_out))

        if W_list is None:
            W = numpy.asarray(numpy_rng.uniform(
                low=-4. * numpy.sqrt(6. / (n_in + n_out)),
                high=4. * numpy.sqrt(6. / (n_in + n_out)),
                size=(n_in, n_out)), dtype=theano.config.floatX)
        else:
            W = W_list[i]

        if b_list is None:
            b = numpy.zeros((n_out,), dtype=theano.config.floatX)
        else:
            b = b_list[i]

        sigmoid_layer = HiddenLayer(
            rng=numpy_rng,
            input=layer_input,
            n_in=n_in,
            n_out=n_out,
            W=theano.shared(W, name='W', borrow=True),
            b=theano.shared(b, name='b', borrow=True),
            activation=tensor.nnet.sigmoid)

        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)

        # it's arguably a philosophical question... but we are
        # going to only declare that the parameters of the
        # sigmoid_layers are parameters of the DBN. The visible
        # biases in the RBM are parameters of those RBMs, but not
        # of the DBN.
        self.params.extend(sigmoid_layer.params)

        # Construct an RBM that shares weights with this layer; the
        # first layer can be a Gaussian-Bernoulli RBM so that
        # real-valued inputs are modelled properly
        if i == 0 and gauss:
            rbm_layer = GRBM(numpy_rng=numpy_rng,
                             theano_rng=theano_rng,
                             input=layer_input,
                             n_visible=input_size,
                             n_hidden=self.stacked_layers_sizes[i],
                             W=sigmoid_layer.W,
                             hbias=sigmoid_layer.b)
        else:
            rbm_layer = RBM(numpy_rng=numpy_rng,
                            theano_rng=theano_rng,
                            input=layer_input,
                            n_visible=input_size,
                            n_hidden=self.stacked_layers_sizes[i],
                            W=sigmoid_layer.W,
                            hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)
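# The 4 * sqrt(6 / (n_in + n_out)) bound used above is the Glorot/Bengio
# initialization heuristic for sigmoid units (the factor 4 compensates for
# the sigmoid's smaller gradient range compared to tanh). A quick check of
# the bound for the default 784 -> 400 layer:
import numpy
bound = 4.0 * numpy.sqrt(6.0 / (784 + 400))
print('init bound for 784->400 sigmoid layer: %.4f' % bound)  # ~0.2848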
def __init__(self, numpy_rng, theano_rng=None, n_ins=784,
             hidden_layers_sizes=[500, 500], n_outs=10):
    self.sigmoid_layers = []
    self.rbm_layers = []
    self.params = []
    self.n_layers = len(hidden_layers_sizes)

    assert self.n_layers > 0

    if not theano_rng:
        theano_rng = RandomStreams(numpy_rng.randint(2 ** 30))

    # allocate symbolic variables for the data
    self.x = T.matrix('x')   # the data is presented as rasterized images
    self.y = T.ivector('y')  # the labels are presented as a 1D vector
                             # of [int] labels

    for i in xrange(self.n_layers):
        # construct the sigmoidal layer

        # the size of the input is either the number of hidden
        # units of the layer below or the input size if we are on
        # the first layer
        if i == 0:
            input_size = n_ins
        else:
            input_size = hidden_layers_sizes[i - 1]

        # the input to this layer is either the activation of the
        # hidden layer below or the input of the DBN if you are on
        # the first layer
        if i == 0:
            layer_input = self.x
        else:
            layer_input = self.sigmoid_layers[-1].output

        sigmoid_layer = HiddenLayer(rng=numpy_rng,
                                    input=layer_input,
                                    n_in=input_size,
                                    n_out=hidden_layers_sizes[i],
                                    activation=T.nnet.sigmoid)

        # add the layer to our list of layers
        self.sigmoid_layers.append(sigmoid_layer)
        self.params.extend(sigmoid_layer.params)

        # Construct an RBM that shares weights with this layer
        rbm_layer = RBM(numpy_rng=numpy_rng,
                        theano_rng=theano_rng,
                        input=layer_input,
                        n_visible=input_size,
                        n_hidden=hidden_layers_sizes[i],
                        W=sigmoid_layer.W,
                        hbias=sigmoid_layer.b)
        self.rbm_layers.append(rbm_layer)

    # We now need to add a logistic layer on top of the MLP
    self.logLayer = LogisticRegression(
        input=self.sigmoid_layers[-1].output,
        n_in=hidden_layers_sizes[-1],
        n_out=n_outs)
    self.params.extend(self.logLayer.params)

    # compute the cost for the second phase of training, defined as the
    # negative log likelihood of the logistic regression (output) layer
    self.finetune_cost = self.logLayer.negative_log_likelihood(self.y)

    # symbolic variable for the number of errors made on the minibatch
    # given by self.x and self.y
    self.errors = self.logLayer.errors(self.y)
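# A sketch of how finetune_cost would typically be turned into a training
# function, following the tutorial pattern this class is based on (plain SGD
# over self.params; `train_set_x`, `train_set_y` and the usual theano/T
# imports are assumed to be provided by the caller, as elsewhere in this
# code). Meant as a method of the class above.
def build_finetune_function(self, train_set_x, train_set_y, batch_size,
                            learning_rate):
    index = T.lscalar('index')
    gparams = T.grad(self.finetune_cost, self.params)
    updates = [(param, param - learning_rate * gparam)
               for param, gparam in zip(self.params, gparams)]
    return theano.function(
        inputs=[index],
        outputs=self.finetune_cost,
        updates=updates,
        givens={
            self.x: train_set_x[index * batch_size:(index + 1) * batch_size],
            self.y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })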
def evaluate_lenet5(learning_rate=0.095, n_epochs=2000, nkerns=[20, 50],
                    batch_size=110):
    """ Demonstrates lenet on dataset

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data()
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as a 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # Reshape matrix of rasterized images of shape (batch_size, 50 * 50)
    # to a 4D tensor, compatible with our LeNetConvPoolLayer;
    # (50, 50) is the size of the images in this dataset.
    layer0_input = x.reshape((batch_size, 1, 50, 50))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (50-5+1, 50-5+1) = (46, 46)
    # maxpooling reduces this further to (46/2, 46/2) = (23, 23)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 23, 23)
    layer0 = LeNetConvPoolLayer(rng, input=layer0_input,
                                image_shape=(batch_size, 1, 50, 50),
                                filter_shape=(nkerns[0], 1, 5, 5),
                                poolsize=(2, 2))

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (23-11+1, 23-11+1) = (13, 13)
    # maxpooling reduces this further to (13/2, 13/2) = (6, 6)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 6, 6)
    layer1 = LeNetConvPoolLayer(rng, input=layer0.output,
                                image_shape=(batch_size, nkerns[0], 23, 23),
                                filter_shape=(nkerns[1], nkerns[0], 11, 11),
                                poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices of
    # shape (batch_size, num_pixels) (i.e. a matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 6 * 6).
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 6 * 6,
                         n_out=110, activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=110, n_out=8)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    validate_model = theano.function(
        [index], layer3.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD. Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    train_model = theano.function(
        [index], cost, updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 1393  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                   # go through this many minibatches before
                                   # checking the network on the validation
                                   # set; in this case we check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print 'training @ iter = ', iter
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation error of %f %% obtained at iteration %i, '
          'with test error %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
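# The hand-computed sizes in the comments above (50 -> 46 -> 23 -> 13 -> 6)
# are easy to get wrong when filter sizes change. A small helper for a valid
# convolution followed by non-overlapping max-pooling (assuming the
# tutorial's ignore_border=True pooling, which floors the division):
def conv_pool_output_size(img, filt, pool):
    return (img - filt + 1) // pool

assert conv_pool_output_size(50, 5, 2) == 23   # layer0
assert conv_pool_output_size(23, 11, 2) == 6   # layer1: n_in = nkerns[1]*6*6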
def evaluate_lenet5(learning_rate=0.01, n_epochs=4, emb_size=300,
                    batch_size=50, describ_max_len=20, type_size=12,
                    filter_size=[3, 5], maxSentLen=100,
                    hidden_size=[300, 300]):

    model_options = locals().copy()
    print "model options", model_options

    emb_root = '/save/wenpeng/datasets/LORELEI/multi-lingual-emb/2018-il9-il10/multi-emb/'
    test_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9-setE-as-test-input_ner_filtered_w2.txt'
    output_file_path = '/save/wenpeng/datasets/LORELEI/il9/il9_system_output_onlyMT_BBN_epoch4.json'

    seed = 1234
    np.random.seed(seed)
    rng = np.random.RandomState(seed)  # fixed random seed, so the model generates the same results
    srng = T.shared_randomstreams.RandomStreams(rng.randint(seed))

    word2id = {}
    # all_sentences, all_masks, all_labels, all_other_labels, word2id = load_BBN_il5Trans_il5_dataset(maxlen=maxSentLen)
    # minlen, include one label, at least one word in the sentence
    train_p1_sents, train_p1_masks, train_p1_labels, word2id = \
        load_trainingData_types(word2id, maxSentLen)
    train_p2_sents, train_p2_masks, train_p2_labels, train_p2_other_labels, word2id = \
        load_trainingData_types_plus_others(word2id, maxSentLen)
    test_sents, test_masks, test_lines, word2id = \
        load_official_testData_only_MT(word2id, maxSentLen, test_file_path)

    label_sent, label_mask = load_SF_type_descriptions(word2id, type_size,
                                                       describ_max_len)
    label_sent = np.asarray(label_sent, dtype='int32')
    label_mask = np.asarray(label_mask, dtype=theano.config.floatX)

    train_p1_sents = np.asarray(train_p1_sents, dtype='int32')
    train_p1_masks = np.asarray(train_p1_masks, dtype=theano.config.floatX)
    train_p1_labels = np.asarray(train_p1_labels, dtype='int32')
    train_p1_size = len(train_p1_labels)

    train_p2_sents = np.asarray(train_p2_sents, dtype='int32')
    train_p2_masks = np.asarray(train_p2_masks, dtype=theano.config.floatX)
    train_p2_labels = np.asarray(train_p2_labels, dtype='int32')
    train_p2_other_labels = np.asarray(train_p2_other_labels, dtype='int32')
    train_p2_size = len(train_p2_labels)

    '''
    combine train_p1 and train_p2
    '''
    train_sents = np.concatenate([train_p1_sents, train_p2_sents], axis=0)
    train_masks = np.concatenate([train_p1_masks, train_p2_masks], axis=0)
    train_labels = np.concatenate([train_p1_labels, train_p2_labels], axis=0)
    train_size = train_p1_size + train_p2_size

    test_sents = np.asarray(test_sents, dtype='int32')
    test_masks = np.asarray(test_masks, dtype=theano.config.floatX)
    # test_labels = np.asarray(all_labels[2], dtype='int32')
    test_size = len(test_sents)

    vocab_size = len(word2id) + 1  # add one zero pad index

    rand_values = rng.normal(0.0, 0.01, (vocab_size, emb_size))  # generate a matrix from a Gaussian distribution
    rand_values[0] = np.array(np.zeros(emb_size), dtype=theano.config.floatX)
    id2word = {y: x for x, y in word2id.iteritems()}
    word2vec = load_fasttext_multiple_word2vec_given_file([
        emb_root + '100k-ENG-multicca.300.ENG.txt',
        emb_root + '100k-IL9-multicca.d300.IL9.txt'
    ], 300)
    rand_values = load_word2vec_to_init(rand_values, id2word, word2vec)
    embeddings = theano.shared(
        value=np.array(rand_values, dtype=theano.config.floatX),
        borrow=True)  # wrap the python variable "rand_values" into a theano variable

    # now, start to build the input form of the model
    sents_id_matrix = T.imatrix('sents_id_matrix')
    sents_mask = T.fmatrix('sents_mask')
    labels = T.imatrix('labels')  # batch*12
    other_labels = T.imatrix()    # batch*4
    des_id_matrix = T.imatrix()
    des_mask = T.fmatrix()

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # the input format can be adapted into CNN or GRU or LSTM
    common_input = embeddings[sents_id_matrix.flatten()].reshape(
        (batch_size, maxSentLen, emb_size)).dimshuffle(0, 2, 1)
    bow_emb = T.sum(common_input * sents_mask.dimshuffle(0, 'x', 1), axis=2)
    repeat_common_input = T.repeat(
        normalize_tensor3_colwise(common_input), type_size,
        axis=0)  # (batch_size*type_size, emb_size, maxsentlen)

    des_input = embeddings[des_id_matrix.flatten()].reshape(
        (type_size, describ_max_len, emb_size)).dimshuffle(0, 2, 1)
    bow_des = T.sum(des_input * des_mask.dimshuffle(0, 'x', 1),
                    axis=2)  # (type_size, emb_size)
    repeat_des_input = T.tile(
        normalize_tensor3_colwise(des_input),
        (batch_size, 1, 1))  # (batch_size*type_size, emb_size, maxsentlen)

    conv_W, conv_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W2, conv_b2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    multiCNN_para = [conv_W, conv_b, conv_W2, conv_b2]

    conv_att_W, conv_att_b = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]))
    conv_W_context, conv_b_context = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    conv_att_W2, conv_att_b2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]))
    conv_W_context2, conv_b_context2 = create_conv_para(
        rng, filter_shape=(hidden_size[0], 1, emb_size, 1))
    ACNN_para = [conv_att_W, conv_att_b, conv_W_context,
                 conv_att_W2, conv_att_b2, conv_W_context2]

    '''
    multi-CNN
    '''
    # the mask is multiplied with conv_out so that features produced by
    # UNK/padding positions are set to zero
    conv_model = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        W=conv_W, b=conv_b)
    # each sentence then has an embedding of length hidden_size
    sent_embeddings = conv_model.maxpool_vec  # (batch_size, hidden_size)

    conv_model2 = Conv_with_Mask(
        rng,
        input_tensor3=common_input,
        mask_matrix=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        W=conv_W2, b=conv_b2)
    sent_embeddings2 = conv_model2.maxpool_vec  # (batch_size, hidden_size)

    '''
    GRU
    '''
    U1, W1, b1 = create_GRU_para(rng, emb_size, hidden_size[0])
    GRU_NN_para = [U1, W1, b1]  # U1 and W1 each include 3 matrices; b1 is the bias
    # gru_input = common_input.dimshuffle((0,2,1))  # gru requires input (batch_size, emb_size, maxSentLen)
    gru_layer = GRU_Batch_Tensor_Input_with_Mask(common_input, sents_mask,
                                                 hidden_size[0], U1, W1, b1)
    gru_sent_embeddings = gru_layer.output_sent_rep  # (batch_size, hidden_size)

    '''
    ACNN
    '''
    attentive_conv_layer = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[0]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W, b=conv_att_b,
        W_context=conv_W_context, b_context=conv_b_context)
    sent_att_embeddings = attentive_conv_layer.attentive_maxpool_vec_l

    attentive_conv_layer2 = Attentive_Conv_for_Pair(
        rng,
        origin_input_tensor3=common_input,
        origin_input_tensor3_r=common_input,
        input_tensor3=common_input,
        input_tensor3_r=common_input,
        mask_matrix=sents_mask,
        mask_matrix_r=sents_mask,
        image_shape=(batch_size, 1, emb_size, maxSentLen),
        image_shape_r=(batch_size, 1, emb_size, maxSentLen),
        filter_shape=(hidden_size[0], 1, emb_size, filter_size[1]),
        filter_shape_context=(hidden_size[0], 1, emb_size, 1),
        W=conv_att_W2, b=conv_att_b2,
        W_context=conv_W_context2, b_context=conv_b_context2)
    sent_att_embeddings2 = attentive_conv_layer2.attentive_maxpool_vec_l

    '''
    cross-DNN-dataless
    '''
    # first map the label embeddings into the hidden space
    HL_layer_1_W, HL_layer_1_b = create_HiddenLayer_para(rng, emb_size,
                                                         hidden_size[0])
    HL_layer_1_params = [HL_layer_1_W, HL_layer_1_b]
    HL_layer_1 = HiddenLayer(rng, input=bow_des, n_in=emb_size,
                             n_out=hidden_size[0],
                             W=HL_layer_1_W, b=HL_layer_1_b,
                             activation=T.tanh)
    des_rep_hidden = HL_layer_1.output  # (type_size, hidden_size)
    dot_dnn_dataless_1 = T.tanh(
        sent_embeddings.dot(des_rep_hidden.T))  # (batch_size, type_size)
    dot_dnn_dataless_2 = T.tanh(sent_embeddings2.dot(des_rep_hidden.T))

    '''
    dataless cosine
    '''
    cosine_scores = normalize_matrix_rowwise(bow_emb).dot(
        normalize_matrix_rowwise(bow_des).T)
    cosine_score_matrix = T.nnet.sigmoid(cosine_scores)  # (batch_size, type_size)

    '''
    dataless top-30 fine-grained cosine
    '''
    fine_grained_cosine = T.batched_dot(
        repeat_common_input.dimshuffle(0, 2, 1),
        repeat_des_input)  # (batch_size*type_size, maxsentlen, describ_max_len)
    fine_grained_cosine_to_matrix = fine_grained_cosine.reshape(
        (batch_size * type_size, maxSentLen * describ_max_len))
    sort_fine_grained_cosine_to_matrix = T.sort(fine_grained_cosine_to_matrix,
                                                axis=1)
    top_k_simi = sort_fine_grained_cosine_to_matrix[:, -30:]  # (batch_size*type_size, 30)
    max_fine_grained_cosine = T.mean(top_k_simi, axis=1)
    top_k_cosine_scores = max_fine_grained_cosine.reshape(
        (batch_size, type_size))
    top_k_score_matrix = T.nnet.sigmoid(top_k_cosine_scores)

    acnn_LR_input = T.concatenate([
        dot_dnn_dataless_1, dot_dnn_dataless_2, cosine_score_matrix,
        top_k_score_matrix, sent_embeddings, sent_embeddings2,
        gru_sent_embeddings, sent_att_embeddings, sent_att_embeddings2,
        bow_emb
    ], axis=1)
    acnn_LR_input_size = hidden_size[0] * 5 + emb_size + 4 * type_size

    # classification layer: just a mapping from the concatenated feature
    # vector of size acnn_LR_input_size to a vector of 12 type scores
    acnn_U_a, acnn_LR_b = create_LR_para(rng, acnn_LR_input_size, 12)
    acnn_LR_para = [acnn_U_a, acnn_LR_b]
    acnn_layer_LR = LogisticRegression(
        rng, input=acnn_LR_input, n_in=acnn_LR_input_size, n_out=12,
        W=acnn_U_a, b=acnn_LR_b)  # basically a multiplication between the weight matrix and the input feature vector
    acnn_score_matrix = T.nnet.sigmoid(acnn_layer_LR.before_softmax)  # batch * 12
    acnn_prob_pos = T.where(labels < 1, 1.0 - acnn_score_matrix,
                            acnn_score_matrix)
    acnn_loss = -T.mean(T.log(acnn_prob_pos))

    acnn_other_U_a, acnn_other_LR_b = create_LR_para(rng,
                                                     acnn_LR_input_size, 16)
    acnn_other_LR_para = [acnn_other_U_a, acnn_other_LR_b]
    acnn_other_layer_LR = LogisticRegression(rng, input=acnn_LR_input,
                                             n_in=acnn_LR_input_size,
                                             n_out=16,
                                             W=acnn_other_U_a,
                                             b=acnn_other_LR_b)
    acnn_other_prob_matrix = T.nnet.softmax(
        acnn_other_layer_LR.before_softmax.reshape((batch_size * 4, 4)))
    acnn_other_prob_tensor3 = acnn_other_prob_matrix.reshape(
        (batch_size, 4, 4))
    acnn_other_prob = acnn_other_prob_tensor3[
        T.repeat(T.arange(batch_size), 4),
        T.tile(T.arange(4), (batch_size)),
        other_labels.flatten()]
    acnn_other_field_loss = -T.mean(T.log(acnn_other_prob))

    # put all model parameters together
    params = multiCNN_para + GRU_NN_para + ACNN_para + acnn_LR_para + \
        HL_layer_1_params
    cost = acnn_loss + 1e-4 * ((conv_W ** 2).sum() + (conv_W2 ** 2).sum() +
                               (conv_att_W ** 2).sum() +
                               (conv_att_W2 ** 2).sum())
    updates = Gradient_Cost_Para(cost, params, learning_rate)

    other_paras = params + acnn_other_LR_para
    cost_other = cost + acnn_other_field_loss
    other_updates = Gradient_Cost_Para(cost_other, other_paras, learning_rate)

    '''
    testing
    '''
    ensemble_NN_scores = acnn_score_matrix
    # T.max(T.concatenate([att_score_matrix.dimshuffle('x',0,1),
    #                      score_matrix.dimshuffle('x',0,1),
    #                      acnn_score_matrix.dimshuffle('x',0,1)], axis=0), axis=0)
    # '''
    # majority voting, does not work
    # '''
    # binarize_NN = T.where(ensemble_NN_scores > 0.5, 1, 0)
    # binarize_dataless = T.where(cosine_score_matrix > 0.5, 1, 0)
    # binarize_dataless_finegrained = T.where(top_k_score_matrix > 0.5, 1, 0)
    # binarize_conc = T.concatenate([binarize_NN.dimshuffle('x',0,1),
    #                                binarize_dataless.dimshuffle('x',0,1),
    #                                binarize_dataless_finegrained.dimshuffle('x',0,1)], axis=0)
    # sum_binarize_conc = T.sum(binarize_conc, axis=0)
    # binarize_prob = T.where(sum_binarize_conc > 0.0, 1, 0)
    # '''
    # sum up prob, works
    # '''
    # ensemble_scores_1 = 0.6*ensemble_NN_scores + 0.4*top_k_score_matrix
    # binarize_prob = T.where(ensemble_scores_1 > 0.3, 1, 0)
    '''
    sum up prob, works
    '''
    ensemble_scores = ensemble_NN_scores
    # 0.6*ensemble_NN_scores + 0.4*0.5*(cosine_score_matrix+top_k_score_matrix)
    binarize_prob = T.where(ensemble_scores > 0.3, 1, 0)

    '''
    test for other fields
    '''
    sum_tensor3 = acnn_other_prob_tensor3  # (batch, 4, 4)

    # train_model = theano.function([sents_id_matrix, sents_mask, labels], cost, updates=updates, on_unused_input='ignore')
    train_p1_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask],
        cost, updates=updates, allow_input_downcast=True,
        on_unused_input='ignore')
    train_p2_model = theano.function(
        [sents_id_matrix, sents_mask, labels, des_id_matrix, des_mask,
         other_labels],
        cost_other, updates=other_updates, allow_input_downcast=True,
        on_unused_input='ignore')
    test_model = theano.function(
        [sents_id_matrix, sents_mask, des_id_matrix, des_mask],
        [binarize_prob, ensemble_scores, sum_tensor3],
        allow_input_downcast=True, on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 50000000000  # look at this many examples regardless

    start_time = time.time()
    mid_time = start_time
    past_time = mid_time
    epoch = 0
    done_looping = False

    n_train_batches = train_size / batch_size
    train_batch_start = list(
        np.arange(n_train_batches) * batch_size) + [train_size - batch_size]
    n_train_p2_batches = train_p2_size / batch_size
    train_p2_batch_start = list(
        np.arange(n_train_p2_batches) * batch_size) + [train_p2_size - batch_size]
    n_test_batches = test_size / batch_size
    n_test_remain = test_size % batch_size
    test_batch_start = list(
        np.arange(n_test_batches) * batch_size) + [test_size - batch_size]

    train_p2_batch_start_set = set(train_p2_batch_start)
    # max_acc_dev = 0.0
    # max_meanf1_test = 0.0
    # max_weightf1_test = 0.0
    train_indices = range(train_size)
    train_p2_indices = range(train_p2_size)
    cost_i = 0.0
    other_cost_i = 0.0
    min_mean_frame = 100.0

    while epoch < n_epochs:
        epoch = epoch + 1

        random.Random(100).shuffle(train_indices)
        random.Random(100).shuffle(train_p2_indices)
        iter_accu = 0

        for batch_id in train_batch_start:  # for each batch
            # iter means how many batches have been run, across epochs
            iter = (epoch - 1) * n_train_batches + iter_accu + 1
            iter_accu += 1
            train_id_batch = train_indices[batch_id:batch_id + batch_size]
            cost_i += train_p1_model(train_sents[train_id_batch],
                                     train_masks[train_id_batch],
                                     train_labels[train_id_batch],
                                     label_sent, label_mask)

            if batch_id in train_p2_batch_start_set:
                train_p2_id_batch = train_p2_indices[batch_id:batch_id + batch_size]
                other_cost_i += train_p2_model(
                    train_p2_sents[train_p2_id_batch],
                    train_p2_masks[train_p2_id_batch],
                    train_p2_labels[train_p2_id_batch],
                    label_sent, label_mask,
                    train_p2_other_labels[train_p2_id_batch])
            # else:
            #     random_batch_id = random.choice(train_p2_batch_start)
            #     train_p2_id_batch = train_p2_indices[random_batch_id:random_batch_id + batch_size]
            #     other_cost_i += train_p2_model(
            #         train_p2_sents[train_p2_id_batch],
            #         train_p2_masks[train_p2_id_batch],
            #         train_p2_labels[train_p2_id_batch],
            #         label_sent,
            #         label_mask,
            #         train_p2_other_labels[train_p2_id_batch])

            # after every 20 batches, test the performance of the model
            # on all test data
            if iter % 20 == 0:
                print 'Epoch ', epoch, 'iter ' + str(iter) + \
                    ' average cost: ' + str(cost_i / iter), \
                    str(other_cost_i / iter), \
                    'uses ', (time.time() - past_time) / 60.0, 'min'
                past_time = time.time()

                pred_types = []
                pred_confs = []
                pred_others = []
                for i, test_batch_id in enumerate(test_batch_start):  # for each test batch
                    pred_types_i, pred_conf_i, pred_fields_i = test_model(
                        test_sents[test_batch_id:test_batch_id + batch_size],
                        test_masks[test_batch_id:test_batch_id + batch_size],
                        label_sent, label_mask)
                    if i < len(test_batch_start) - 1:
                        pred_types.append(pred_types_i)
                        pred_confs.append(pred_conf_i)
                        pred_others.append(pred_fields_i)
                    else:
                        # the last batch overlaps the previous one; keep
                        # only the genuinely remaining test examples
                        pred_types.append(pred_types_i[-n_test_remain:])
                        pred_confs.append(pred_conf_i[-n_test_remain:])
                        pred_others.append(pred_fields_i[-n_test_remain:])

                pred_types = np.concatenate(pred_types, axis=0)
                pred_confs = np.concatenate(pred_confs, axis=0)
                pred_others = np.concatenate(pred_others, axis=0)
                mean_frame = generate_2018_official_output(
                    test_lines, output_file_path, pred_types, pred_confs,
                    pred_others, min_mean_frame)
                if mean_frame < min_mean_frame:
                    min_mean_frame = mean_frame
                print '\t\t\t test over, min_mean_frame:', min_mean_frame

        print 'Epoch ', epoch, 'uses ', (time.time() - mid_time) / 60.0, 'min'
        mid_time = time.time()
        # print 'Batch_size: ', update_freq

    end_time = time.time()
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
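# Gradient_Cost_Para is an external helper in this codebase; a minimal
# plain-SGD sketch of what such a helper conventionally returns (the real
# implementation may well use AdaGrad or Adam instead -- this is an
# assumption, not the codebase's actual implementation):
import theano.tensor as T

def gradient_cost_para_sgd(cost, params, learning_rate):
    grads = T.grad(cost, params)
    return [(p, p - learning_rate * g) for p, g in zip(params, grads)]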