def pretraining_functions(self, train_set_x, train_set_y, batch_size):
    """Compile one pre-training function per layer in ``self.sugar_layers``.

    Each compiled function takes a minibatch ``index`` plus three optional
    arguments (``corruption`` default 0.2, ``lr`` default 0.1, ``switch``
    default 1), returns ``[cost]`` and applies the layer's updates.

    :param train_set_x: indexable training inputs, sliced per minibatch
    :param train_set_y: indexable training targets, sliced per minibatch
    :param batch_size: number of rows in each minibatch slice
    :return: list of compiled functions, in the order of ``self.sugar_layers``
    """
    # Symbolic inputs shared by every compiled function.
    index = tensor.lscalar('index')
    corruption_level = tensor.scalar('corruption')
    learning_rate = tensor.scalar('lr')
    switch = tensor.iscalar('switch')

    # Rows [batch_begin, batch_end) make up minibatch number `index`.
    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    pretrain_fns = []
    for sugar in self.sugar_layers:
        cost, updates = sugar.get_cost_updates(corruption_level,
                                               learning_rate, switch)
        fn = function(
            inputs=[
                index,
                Param(corruption_level, default=0.2),
                Param(learning_rate, default=0.1),
                Param(switch, default=1),
            ],
            outputs=[cost],
            updates=updates,
            givens={
                self.x: train_set_x[batch_begin:batch_end],
                self.y: train_set_y[batch_begin:batch_end],
            },
            # Not every layer's cost has to depend on every input.
            on_unused_input='ignore')
        pretrain_fns.append(fn)
    return pretrain_fns
def greedy_pre_training(self, train_x, batch_size=1, pre_lr=0.25,dropout=True,denoising=False):
    """Compile one greedy pre-training function per layer in ``self.sa_layers``.

    Every compiled function takes a minibatch ``index`` and the optional
    scalars ``lam``, ``beta`` and ``rho`` (each defaulting to 0.25), returns
    the layer cost and applies the layer's updates.

    ``dropout`` is accepted but not used inside this method.

    :return: list of compiled functions, one per layer, in layer order
    """
    index = T.lscalar('index')
    lam = T.scalar('lam')
    beta = T.scalar('beta')
    rho = T.scalar('rho')

    print("\nCompiling functions for DA layers...")

    # Only self.x is substituted with a minibatch slice. Deeper layers are
    # expressed in terms of self.x, so their hidden activations are computed
    # for that same minibatch without any further substitution.
    minibatch = train_x[index * batch_size: (index+1) * batch_size]

    compiled = []
    for layer_idx, sa in enumerate(self.sa_layers):
        cost, updates = sa.get_cost_and_updates(
            l_rate=pre_lr,
            lam=lam,
            beta=beta,
            rho=rho,
            cost_fn=self.cost_fn_names[0],
            corruption_level=self.corruption_levels[layer_idx],
            denoising=denoising)
        compiled.append(
            function(inputs=[index,
                             Param(lam, default=0.25),
                             Param(beta, default=0.25),
                             Param(rho, default=0.25)],
                     outputs=cost,
                     updates=updates,
                     givens={self.x: minibatch}))
    return compiled
def test_examples_7(self):
    """Check positional and by-name overriding of ``Param`` defaults."""
    from theano import Param

    x, y, w = T.dscalars('x', 'y', 'w')
    z = (x + y) * w
    f = function([x,
                  Param(y, default=1),
                  Param(w, default=2, name='w_by_name')],
                 z)

    # (positional args, keyword args, expected value of (x + y) * w)
    cases = [
        ((33,), {}, 68.0),
        ((33, 2), {}, 70.0),
        ((33, 0, 1), {}, 33.0),
        ((33,), {'w_by_name': 1}, 34.0),
        ((33,), {'w_by_name': 1, 'y': 0}, 33.0),
    ]
    for args, kwargs, expected in cases:
        assert f(*args, **kwargs) == array(expected)
def __init__(self, param_dict):
    """Build the convolutional network graph and compile its functions.

    :param param_dict: configuration mapping. The keys read here are
        ``training_batch_size``, ``nkerns``, ``recept_width``,
        ``pool_width``, ``stride``, ``dropout_prob``, ``l2_reg``,
        ``activation``, ``weights_variance``, ``n_channels``,
        ``n_timesteps``, ``n_fbins`` and ``global_pooling``.

    After construction the instance exposes ``train_model``,
    ``validate_model`` and ``test_model`` as compiled Theano functions.
    """
    self.param_dict = param_dict
    self.training_batch_size = param_dict['training_batch_size']
    nkerns = param_dict['nkerns']
    recept_width = param_dict['recept_width']
    pool_width = param_dict['pool_width']
    stride = param_dict['stride']
    dropout_prob = param_dict['dropout_prob']
    weight_decay = param_dict['l2_reg']
    activation = param_dict['activation']
    weights_variance = param_dict['weights_variance']
    n_channels = param_dict['n_channels']
    n_timesteps = param_dict['n_timesteps']
    n_fbins = param_dict['n_fbins']
    global_pooling = param_dict['global_pooling']

    # Fixed seed so that weight initialisation is reproducible.
    rng = np.random.RandomState(23455)

    self.training_mode = T.iscalar('training_mode')
    self.x = T.tensor4('x')
    self.y = T.bvector('y')

    # The batch size is a shared variable so it can be changed with
    # set_value() after compilation without rebuilding the graph.
    self.batch_size = theano.shared(self.training_batch_size)
    self.input = self.x.reshape(
        (self.batch_size, 1, n_channels * n_fbins, n_timesteps))

    # The first dropout probability goes to the feature extractor, the
    # last one to the classifier.
    self.feature_extractor = FeatureExtractor(
        rng, self.input, nkerns, recept_width, pool_width, stride,
        self.training_mode, dropout_prob[0], activation, weights_variance,
        n_channels, n_timesteps, n_fbins, global_pooling)
    self.classifier = SoftmaxLayer(rng=rng,
                                   input=self.feature_extractor.output,
                                   n_in=nkerns[-1],
                                   training_mode=self.training_mode,
                                   dropout_prob=dropout_prob[-1])
    self.weights = self.feature_extractor.weights + self.classifier.weights

    # ---------------------- BACKPROP
    self.cost = self.classifier.cross_entropy_cost(self.y)
    # Only every second entry of the weight list is penalised
    # (presumably the W matrices, skipping biases -- TODO confirm).
    L2_sqr = sum((weight ** 2).sum() for weight in self.weights[::2])
    self.grads = T.grad(self.cost + weight_decay * L2_sqr, self.weights)
    self.updates = self.adadelta_updates(self.grads, self.weights)
    # self.updates = self.nesterov_momentum(self.grads, self.weights)

    # --------------------- FUNCTIONS
    # training_mode defaults to 1 when training and to 0 otherwise.
    self.train_model = theano.function(
        [self.x, self.y, Param(self.training_mode, default=1)],
        outputs=self.cost,
        updates=self.updates)
    self.validate_model = theano.function(
        [self.x, self.y, Param(self.training_mode, default=0)],
        self.cost)
    # Returns column 1 of the class-probability matrix.
    self.test_model = theano.function(
        [self.x, Param(self.training_mode, default=0)],
        self.classifier.p_y_given_x[:, 1])
def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    """Scatter-plot a 2-D t-SNE embedding of the network's extracted features.

    Loads ``<model_path>/<subject>.pickle``, rebuilds the ``ConvNet`` from
    the stored params and weights, runs the feature extractor over the
    selected data set and shows the embedded points coloured by label.

    :param dataset: ``'test'`` (labels come from ``test_labels['preictal']``)
        or ``'train'`` (labels come from the loaded data)
    :raises ValueError: if ``dataset`` is neither ``'test'`` nor ``'train'``
    """
    # NOTE: unpickling can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = cPickle.load(f)

    params = state_dict['params']
    cnn = ConvNet(params)
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        data = load_test_data(data_path, subject)
        x = data['x']
        y = test_labels['preictal']
    elif dataset == 'train':
        data = load_train_data(data_path, subject)
        x = data['x']
        y = data['y']
    else:
        raise ValueError('dataset')

    # Apply the scaling scheme recorded with the model, reusing its scalers.
    if params['scale_time']:
        scale = scale_across_time
    else:
        scale = scale_across_features
    x, _ = scale(x, x_test=None, scalers=scalers)

    # Process the whole data set as a single batch.
    cnn.batch_size.set_value(x.shape[0])
    get_features = theano.function(
        [cnn.x, Param(cnn.training_mode, default=0)],
        cnn.feature_extractor.output,
        allow_input_downcast=True)
    features = get_features(x)

    embedder = TSNE(n_components=2, random_state=0)
    z = embedder.fit_transform(np.float64(features))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
def test_examples_6(self):
    """Check that a ``Param`` default is used unless overridden positionally."""
    from theano import Param

    x, y = T.dscalars('x', 'y')
    total = x + y
    f = function([x, Param(y, default=1)], total)

    # (positional args, expected value of x + y)
    for args, expected in [((33,), 34.0), ((33, 2), 35.0)]:
        assert f(*args) == array(expected)
def __init__(self, num_features, num_classes):
    """Build a softmax-regression graph and compile its helper functions.

    All weights live in one flat shared vector ``self.theta`` of length
    ``num_features * num_classes`` (initialised to zeros), viewed as a
    ``(num_features, num_classes)`` matrix ``self.w``.
    """
    self.num_features = num_features
    self.num_classes = num_classes

    # Inputs: a feature matrix and a vector of integer class labels.
    # The labels are used as column indices into the log-probabilities.
    x = T.matrix("x")
    y = T.ivector("y")

    # Flat, zero-initialised parameter vector and its matrix view.
    theta_init = numpy.zeros(num_features * num_classes,
                             dtype=theano.config.floatX)
    self.theta = theano.shared(value=theta_init, name='theta', borrow=True)
    w = self.theta.reshape((num_features, num_classes))
    # There is no separate bias term: a column of 1s is expected in the
    # input instead.

    # L2 regularization coefficient (symbolic, supplied at call time).
    self.reg_coef = T.scalar("reg")

    # Linear scores, class distribution and hard prediction. The argmax is
    # taken on the raw scores because softmax does not change the ordering.
    activation = T.dot(x, w)
    class_probs = nnet.softmax(activation)
    prediction = T.argmax(activation, axis=1)

    # Mean negative log-probability of the correct class.
    row_ids = T.arange(y.shape[0])
    xent = -T.mean(T.log(class_probs)[row_ids, y])

    # Regularized cost; the first row of w is left out of the penalty
    # (presumably the weights of the constant-1 input -- TODO confirm).
    penalty = (w[1:, :]**2).sum()
    cost = xent + self.reg_coef * penalty

    # Gradients with respect to the matrix view and the flat vector.
    self.gw = T.grad(cost, w)
    self.gtheta = T.grad(cost, self.theta)

    # Fraction of rows whose hard prediction differs from the label.
    error = T.mean(T.neq(prediction, y))

    # Compile
    self._predict_fn = theano.function(inputs=[x], outputs=prediction)
    self._prob_fn = theano.function(inputs=[x], outputs=class_probs)
    self._cost_fn = theano.function(inputs=[x, y], outputs=xent)
    self._cost_fn_reg = theano.function(
        inputs=[x, y, Param(self.reg_coef, default=0.01)],
        outputs=cost)
    self._error_fn = theano.function(inputs=[x, y], outputs=error)

    self.w = w
    self.x = x
    self.y = y
    self._cost_without_reg = xent
def fine_tuning(self, datasets, batch_size=1, fine_lr=0.2):
    """Compile the supervised fine-tuning step and a validation scorer.

    :param datasets: sequence whose first two entries are the
        ``(inputs, targets)`` pairs for training and validation
    :param batch_size: rows per minibatch, used for both the training and
        the validation slices
    :param fine_lr: learning rate of the plain gradient-descent update
    :return: ``(fine_tune_fn, valid_score)``. ``fine_tune_fn(index)`` runs
        one update on training minibatch ``index`` and returns
        ``self.fine_cost``; its optional second argument defaults to 0.25.
        ``valid_score()`` returns the list of ``self.error`` values over
        all validation minibatches.
    """
    (train_set_x, train_set_y) = datasets[0]
    (valid_set_x, valid_set_y) = datasets[1]

    # Floor division: a trailing partial validation batch is not scored.
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_valid_batches //= batch_size

    index = T.lscalar('index')  # index to a [mini]batch
    batch_begin = index * batch_size
    batch_end = (index + 1) * batch_size

    gparams = T.grad(self.fine_cost, self.thetas)
    updates = [(param, param - gparam * fine_lr)
               for param, gparam in zip(self.thetas, gparams)]

    # The training slice uses the same batch_size argument as validation.
    fine_tune_fn = function(
        inputs=[index, Param(self.lam_fine_tune, default=0.25)],
        outputs=self.fine_cost,
        updates=updates,
        givens={
            self.x: train_set_x[batch_begin:batch_end],
            self.y: train_set_y[batch_begin:batch_end],
        })

    validation_fn = function(
        inputs=[index],
        outputs=self.error,
        givens={
            self.x: valid_set_x[batch_begin:batch_end],
            self.y: valid_set_y[batch_begin:batch_end],
        },
        name='valid')

    def valid_score():
        return [validation_fn(i) for i in xrange(n_valid_batches)]

    return fine_tune_fn, valid_score
def test():
    """Exercise three Theano basics: multiple outputs, defaults, shared state."""
    # One function, two matrix inputs, three outputs.
    a, b = T.dmatrices('a', 'b')
    diff = a - b
    outputs = [diff, T.abs_(diff), diff ** 2]
    f = function([a, b], outputs)
    h, i, j = f([[0, 1], [2, 3]], [[4, 5], [6, 7]])

    # Default value for a function argument; b may also be passed by name.
    a, b = T.dscalars('a', 'b')
    f = function([a, Param(b, default=1)], a + b)
    print(f(1, b=2))
    print(f(1))
    print(f(1, 2))

    # Shared variable used as an accumulator. The call returns the state
    # and applies the update in the same step.
    state = shared(0)
    inc = T.lscalar('inc')  # state is int64 by default
    accumulator = function([inc], state, updates=[(state, state + inc)])
    print(accumulator(300))
    print(state.get_value())
def __init__(self, network, regularize_bias=False, optimization="sgd"):
    """
    Build the class-weighted training graph for `network` and compile
    `_train_fn`, `_cost_fn` and `_costs`.

    optimization picks the update rule. 'adadelta' gives AdaDelta updates;
    any other value, including the default 'sgd', gives plain stochastic
    gradient descent.

    regularize_bias adds the two bias vectors to the L2 penalty.

    """
    self.network = network
    x, y = network.x, network.y

    # Symbolic training hyper-parameters
    self.learning_rate = T.scalar("learning_rate")
    self.reg_coef = T.scalar("reg_coef")
    self.class_weights = T.vector("class_weights", dtype="float64")
    # Only used by AdaDelta
    self.decay = T.scalar("decay")
    self.optimization = optimization

    # Class-weighted cross-entropy: log-probability of each example's
    # target class, scaled by the weight of that class
    log_probs = T.log(network.class_probs[T.arange(y.shape[0]), y])
    xent = -T.mean(log_probs * self.class_weights[y])

    # Add the L2 penalty on the weight matrices (and optionally the biases)
    weight_penalty = (network.w0**2.).mean() + (network.w1**2.).mean()
    cost = xent + self.reg_coef * weight_penalty
    if regularize_bias:
        bias_penalty = (network.b0**2.).mean() + (network.b1**2.).mean()
        cost = cost + self.reg_coef * bias_penalty

    parameters = [network.w0, network.w1, network.b0, network.b1]
    gradients = [T.grad(cost, param) for param in parameters]

    if optimization == "adadelta":
        # AdaDelta updates, based on Shawn Tan's implementation:
        # https://blog.wtf.sg/2014/08/28/implementing-adadelta/
        # Here the "learning rate" input is the small constant added
        # under both square roots.
        extra_params = [
            Param(self.learning_rate, default=1e-6),
            Param(self.decay, default=0.95),
        ]

        # Running averages, one zero-initialised accumulator per parameter
        gradients_sq = []
        for p in parameters:
            gradients_sq.append(
                theano.shared(numpy.zeros(p.get_value().shape)))
        deltas_sq = []
        for p in parameters:
            deltas_sq.append(
                theano.shared(numpy.zeros(p.get_value().shape)))

        # Decayed average of squared gradients for the next iteration
        gradients_sq_new = []
        for g_sq, g in izip(gradients_sq, gradients):
            gradients_sq_new.append(
                self.decay * g_sq + (1 - self.decay) * (g**2))

        # Step for each parameter: gradient scaled by the ratio of the
        # two (approximate) RMS values
        deltas = []
        for d_sq, g_sq, grad in izip(deltas_sq, gradients_sq_new,
                                     gradients):
            rms_ratio = (T.sqrt(d_sq + self.learning_rate) /
                         T.sqrt(g_sq + self.learning_rate))
            deltas.append(rms_ratio * grad)

        # Decayed average of squared steps for the next iteration
        deltas_sq_new = []
        for d_sq, d in izip(deltas_sq, deltas):
            deltas_sq_new.append(
                self.decay * d_sq + (1 - self.decay) * (d**2))

        # Accumulators first, then the model parameters themselves
        updates = list(zip(gradients_sq, gradients_sq_new))
        updates += list(zip(deltas_sq, deltas_sq_new))
        updates += [(param, param - delta)
                    for (param, delta) in izip(parameters, deltas)]
    else:
        # Standard SGD updates
        extra_params = [Param(self.learning_rate, default=0.1)]
        updates = []
        for (param, grad) in zip(parameters, gradients):
            updates.append((param, param - self.learning_rate * grad))

    def class_weights_input():
        # Every class gets weight 1 unless the caller says otherwise
        uniform = numpy.ones(network.num_classes, dtype=numpy.float64)
        return Param(self.class_weights, default=uniform)

    # Bias enabled for training and for cost evaluation
    bias_enabled = [(network.output_bias, 1), (network.hidden_bias, 1)]

    # Compile
    self._train_fn = theano.function(
        inputs=[
            x,
            y,
            Param(self.reg_coef, default=0.01),
            class_weights_input(),
        ] + extra_params,
        outputs=T.sum(log_probs),
        updates=updates,
        givens=bias_enabled,
    )
    self._cost_fn = theano.function(
        inputs=[
            x,
            y,
            class_weights_input(),
            Param(self.reg_coef, default=0.01),
        ],
        outputs=cost,
        givens=bias_enabled,
        mode='FAST_RUN',
    )
    self._costs = theano.function(
        inputs=[x, y, class_weights_input()],
        outputs=xent,
        givens=bias_enabled,
    )
def validate(self, train_set, valid_set, init_learning_rate, max_iters, validation_frequency, improvement_threshold): train_set_iterator = TrainSetIterator(train_set, self.training_batch_size) n_batches = train_set_iterator.get_number_of_batches() print 'training set \nshape:', train_set[ 1].shape, 'number of seizures:', np.sum( train_set[1]), 'number of batches:', n_batches valid_set_x, valid_set_y = valid_set valid_size = valid_set_x.shape[0] print 'validation set \nshape:', valid_size, 'number of seizures:', np.sum( valid_set[1]) learning_rate = theano.shared(np.float32(init_learning_rate)) learning_rate_decay = np.float32(init_learning_rate / max_iters) cost = self.layer3.negative_log_likelihood(self.y) grads = T.grad(cost, self.params) #self._check_num_gradient(train_set_iterator.next()) #updates = self._momentum_updates(grads, learning_rate) #updates = self._rmsprop_updates(grads,learning_rate) updates = self._vanilla_updates(grads, learning_rate) #-------------------- FUNCTIONS tp, tn = self.layer3.tptn(self.y) fp, fn = self.layer3.fpfn(self.y) train_model = theano.function( [self.x, self.y, Param(self.training_mode, default=1)], [cost, self.layer3.p_y_given_x, self.layer2.output], updates=updates, on_unused_input='ignore') validate_model = theano.function( [self.x, self.y, Param(self.training_mode, default=0)], [cost, tp, tn, fp, fn], on_unused_input='ignore') #------------------------------ TRAINING iter = 0 epoch = 0 best_cost = np.inf best_iter = 0 patience_increase = 2 patience = 150 * validation_frequency #50 done_looping = False start_time = time.clock() while not done_looping: epoch += 1 for x, y in train_set_iterator: iter += 1 train_model(x, y) learning_rate.set_value( max(learning_rate.get_value() - learning_rate_decay, 0.0)) # ------------------------ VALIDATION if iter % validation_frequency == 0: self.batch_size.set_value(valid_size) [valid_cost, tp, tn, fp, fn] = validate_model(valid_set_x, valid_set_y) print epoch, iter, tp, tn, fp, fn, 
valid_cost, learning_rate.get_value( ) self.batch_size.set_value(self.training_batch_size) if valid_cost < best_cost: if valid_cost < best_cost * improvement_threshold: patience = max(patience, iter * patience_increase) best_iter = iter best_cost = valid_cost if iter >= max_iters or patience <= iter: done_looping = True break print 'time:', (time.clock() - start_time) / 60. print 'best_iter:', best_iter return best_iter
def test(self, train_set, test_set, init_learning_rate, learning_rate_decay, opt_iters, out_file): train_set_iterator = TrainSetIterator(train_set, self.training_batch_size) n_batches = train_set_iterator.get_number_of_batches() print 'training set \nshape:', train_set[ 1].shape, 'number of seizures:', np.sum( train_set[1]), 'number of batches:', n_batches test_set_x, test_set_y = test_set test_size = test_set_x.shape[0] print 'test set \nshape:', test_size, 'number of seizures:', np.sum( test_set[1]) learning_rate = theano.shared(np.float32(init_learning_rate)) learning_rate_decay = np.float32(learning_rate_decay) cost = self.layer3.negative_log_likelihood(self.y) grads = T.grad(cost, self.params) updates = self._vanilla_updates(grads, learning_rate) #----------- FUNCTIONS tp, tn = self.layer3.tptn(self.y) fp, fn = self.layer3.fpfn(self.y) tp_idx = self.layer3.tp_idx(self.y) fp_idx = self.layer3.fp_idx(self.y) train_model = theano.function( [self.x, self.y, Param(self.training_mode, default=1)], cost, updates=updates, on_unused_input='ignore') test_model = theano.function( [self.x, self.y, Param(self.training_mode, default=0)], [tp_idx, fp_idx, tp, tn, fp, fn], on_unused_input='ignore') iter = 0 done_looping = False #------------------------------ TRAINING while not done_looping: for x, y in train_set_iterator: iter += 1 train_model(x, y) learning_rate.set_value( max(learning_rate.get_value() - learning_rate_decay, 0.0)) if iter > opt_iters: done_looping = True break #------------------------------ TESTING self.batch_size.set_value(test_size) [tp_idx, fp_idx, tp, tn, fp, fn] = test_model(test_set_x, test_set_y) seizure_idx = np.flatnonzero(test_set_y) det_dict = detections_and_delay(tp_idx, fp_idx, seizure_idx) print '-- TEST --' print 'tp:', tp, 'tn:', tn, 'fp:', fp, 'fn', fn print 'fp indices:', fp_idx, 'tp indices:', tp_idx print 'seizure indices:', seizure_idx print det_dict json.dump(det_dict, out_file) out_file.write('\n')
#!/usr/bin/env python
import numpy
import theano
from theano import tensor as T
from theano import Param
from theano import function
from theano import shared

rng = numpy.random
__author__ = 'yanziang'

# A function input with a default value: y is 1.0 unless supplied.
x, y = T.dscalars('x', 'y')
z = x + y
f = function([x, Param(y, default=1.)], z)
print(f(33.0))
print(f(33.0, 2.0))

# Logistic regression setup on random data.
print('logistic regression')
N = 400       # number of examples
feats = 784   # number of features per example
D = (
    rng.randn(N, feats),                # inputs
    rng.randint(size=N, low=0, high=2)  # binary targets
)
training_steps = 10000

# Symbolic inputs (these rebind x and y from the example above).
x = T.matrix("x")
y = T.vector("y")

# Model parameters: random weight vector, zero bias.
w = theano.shared(rng.randn(feats), name="w")
b = theano.shared(0., name="b")

print("Initial model:")
print(w.get_value())
def __init__(self, network, optimization="sgd", loss="xent", input_var=None,
             extra_update_params=None, extra_reg_params=None):
    """
    Build the training graph for `network` and compile `_train_fn` and
    `_cost_fn`.

    optimization selects the type of optimization algorithm used.
    'adadelta' implements AdaDelta updates; any other value, including the
    default 'sgd', gives standard stochastic gradient descent.

    loss is "xent" or "l2".

    input_var, if given, replaces the network's own input variable as the
    first input of the compiled functions.

    extra_update_params allows you to specify other parameters that should
    be updated during training. They must, of course, feature in the
    expression that computes the cost function.

    Likewise, extra_reg_params allows you to include extra parameters in
    the L2 regularization term. They should each be a 1D vector.

    Both extra_* arguments default to no extra parameters.

    """
    # Fresh lists per call rather than shared mutable defaults
    extra_update_params = (
        [] if extra_update_params is None else list(extra_update_params))
    extra_reg_params = (
        [] if extra_reg_params is None else list(extra_reg_params))

    self.network = network

    if input_var is not None:
        x = input_var
    else:
        # Take the network's input as input to the training functions
        x = network.x

    # Create a target variable, of the same rank and type as the hidden
    # layer. Special case for a last layer with a single unit: y should
    # then be a plain vector rather than needing to be (M,1).
    if self.network.layer_sizes[-1] == 1:
        y = T.tensor(network.hidden_layer.dtype, (False,), name="y")
        # For computing the cost, add an extra dimension so the result is
        # (M,1), not (M,)
        label_for_cost = y.dimshuffle(0, "x")
    else:
        y = label_for_cost = T.tensor(network.hidden_layer.dtype,
                                      network.hidden_layer.broadcastable,
                                      name="y")

    # Training params
    self.learning_rate = T.scalar("learning_rate")
    self.reg_coef = T.scalar("reg_coef")
    # Needed for AdaDelta
    self.decay = T.scalar("decay")
    self.optimization = optimization

    # Build cost function. The regularization term takes the extra
    # regularization parameters; the extra update parameters only join
    # the list of parameters that receive gradient updates.
    reg = self.network.get_l2_regularization(extra_params=extra_reg_params)
    cost = self.network.get_cost(label_for_cost, loss=loss)
    cost_with_reg = cost + self.reg_coef * reg

    parameters = network.params + extra_update_params
    # Compute the gradient of the cost wrt the parameters
    gradients = [T.grad(cost_with_reg, param) for param in parameters]

    if optimization == "adadelta":
        # AdaDelta updates, based on Shawn Tan's implementation:
        # https://blog.wtf.sg/2014/08/28/implementing-adadelta/
        # Here the "learning rate" input is the small constant added
        # under both square roots.
        extra_params = [Param(self.learning_rate, default=1e-6),
                        Param(self.decay, default=0.95)]

        # Running averages, one zero-initialised accumulator per parameter
        gradients_sq = [theano.shared(numpy.zeros(p.get_value().shape))
                        for p in parameters]
        deltas_sq = [theano.shared(numpy.zeros(p.get_value().shape))
                     for p in parameters]

        # Decayed average of squared gradients for the next iteration
        gradients_sq_new = [
            self.decay*g_sq + (1-self.decay)*(g**2)
            for g_sq, g in izip(gradients_sq, gradients)]

        # Step for each parameter. The square root is an approximation to
        # getting the RMS for the average value.
        deltas = [
            (T.sqrt(d_sq+self.learning_rate) /
             T.sqrt(g_sq+self.learning_rate))*grad
            for d_sq, g_sq, grad in izip(deltas_sq, gradients_sq_new,
                                         gradients)]

        # Decayed average of squared steps for the next iteration
        deltas_sq_new = [
            self.decay*d_sq + (1-self.decay)*(d**2)
            for d_sq, d in izip(deltas_sq, deltas)]

        # Accumulators first, then the model's actual parameters.
        # list() keeps the concatenation valid when zip returns an iterator.
        updates = (
            list(zip(gradients_sq, gradients_sq_new)) +
            list(zip(deltas_sq, deltas_sq_new)) +
            [(param, param - delta)
             for (param, delta) in izip(parameters, deltas)]
        )
    else:
        # Standard SGD updates
        extra_params = [Param(self.learning_rate, default=0.1)]
        updates = [(param, param - self.learning_rate * grad)
                   for (param, grad) in zip(parameters, gradients)]

    # Compile
    self._train_fn = theano.function(
        inputs=[
            x,
            y,
            Param(self.reg_coef, default=0.01),
        ] + extra_params,
        outputs=T.mean(cost),
        updates=updates,
    )
    self._cost_fn = theano.function(
        inputs=[x, y],
        outputs=cost,
    )
import theano.tensor as T
from theano import function
from theano import Param

# First function: y falls back to 1 when it is not supplied.
x, y = T.scalars('x', 'y')
z = x + y
f = function([x, Param(y, default=1)], z)
print(f(33))
print(f(33, 2))

# Second function: two defaulted inputs, the second of which is also
# addressable under the explicit name 'w_by_name'.
w = T.scalar('w')
z_two = (x + y) * w
f_two = function(
    [
        x,
        Param(y, default=1),
        Param(w, default=2, name='w_by_name'),
    ],
    z_two)

print("Second function")
print(f_two(33))
print(f_two(33, 2))
print(f_two(33, 0, 1))
print(f_two(33, w_by_name=1))
print(f_two(33, w_by_name=1, y=0))
def __init__(self, batch_size= 8):
    """Build the convolutional classifier and compile its Theano functions.

    The input is reshaped to ``(batch_size, 1, 225, 225)``. The compiled
    functions are ``train``, ``predict``, ``predict_with_drop``,
    ``predict_prob``, ``validate`` and ``get_embeddings``. Each takes the
    dropout switch ``is_train`` as a trailing optional input whose default
    is 1 for ``train`` and ``predict_with_drop`` and 0 for the others.
    """
    self.batch_size = batch_size
    self.num_updates = 0

    X = T.matrix().reshape((batch_size,1,225,225))
    Y = T.ivector()
    is_train = T.scalar()

    # Layers are created strictly in network order.
    inputs = InputLayer(X,name="input")

    conv0 = ConvLayer(inputs,(64,1,15,15), subsample=(3,3) ,name="conv 0")
    relu0 = RELU(conv0, name="relu 0")
    pool0 = PoolLayer(relu0, pool_size=(3,3), stride=(2,2), name="pool 0")

    conv1 = ConvLayer(pool0,(128,64,5,5), name="conv 1")
    relu1 = RELU(conv1, name="relu 1")
    pool1 = PoolLayer(relu1, pool_size=(3,3), stride=(2,2), name="pool 1")

    conv2 = ConvLayer(pool1,(256,128,3,3),border_mode=(1,1), name="conv 2")
    relu2 = RELU(conv2, name="relu 2")

    conv3 = ConvLayer(relu2,(256,256,3,3),border_mode=(1,1) , name="conv 3")
    relu3 = RELU(conv3, name="relu 3")

    conv4 = ConvLayer(relu3,(256,256,3,3),border_mode=(1,1), name="conv 4")
    relu4 = RELU(conv4, name="relu 4")
    pool2 = PoolLayer(relu4, pool_size=(3,3), stride=(2,2), name="pool 2")
    drop0 = DropoutLayer(pool2, is_train, p = 0.5, name="Drop 0, p = 0.5")

    conv5 = ConvLayer(drop0,(512,256,7,7),name="conv 5")
    relu5 = RELU(conv5,name="relu 5")
    squeezed = Squeeze(relu5,outdim=2, name="squeeze 1")
    drop1 = DropoutLayer(squeezed, is_train, p = 0.5, name="Drop 1, p = 0.5")

    fc1 = FCLayer(drop1,512,512,name="FC 1")
    relu6 = RELU(fc1,name="relu 6")
    drop2 = DropoutLayer(relu6, is_train, p = 0.5, name="Drop 2, p = 0.5")
    fc2 = FCLayer(drop2,512,250,name="FC 2")
    softmax1 = SoftmaxLayer(fc2, name="softmax")

    # Registry of all layers. Note that drop2 is listed before relu6,
    # although relu6 feeds drop2 in the graph.
    self.layers = [
        inputs,
        conv0, relu0, pool0,
        conv1, relu1, pool1,
        conv2, relu2,
        conv3, relu3,
        conv4, relu4, pool2, drop0,
        conv5, relu5, squeezed, drop1,
        fc1, drop2, relu6, fc2, softmax1,
    ]

    predicted_class = T.argmax(softmax1.output(), axis=1)
    cost = CategoricalCrossEntropy(softmax1,Y).output()

    params = get_params(self.layers)
    biases = get_biases(self.layers)
    caches_params = make_caches(params)
    caches_bias = make_caches(biases)

    eta = T.scalar()
    updates = momentum(cost, params, biases, caches_params,caches_bias, eta)

    # Options shared by every compiled function.
    compile_opts = dict(allow_input_downcast=True, on_unused_input='warn')

    self.train = theano.function(
        [X,Y,eta, Param(is_train,1)],
        [cost,predicted_class],
        updates=updates,
        **compile_opts)
    self.predict = theano.function(
        [X,Param(is_train,0)], predicted_class, **compile_opts)
    self.predict_with_drop = theano.function(
        [X,Param(is_train,1)], predicted_class, **compile_opts)
    self.predict_prob = theano.function(
        [X,Param(is_train,0)], softmax1.output(), **compile_opts)
    self.validate = theano.function(
        [X,Y,Param(is_train,0)], cost, **compile_opts)
    self.get_embeddings = theano.function(
        [X,Param(is_train,0)], fc1.output(), **compile_opts)
outputs: 输出参数列表,list或者dict。如果是dict,那么key必须是字符串。 updates: 一组可迭代更新的量 (shared_variable, new_expression)的形式 对其中的shared_variable输入用new_expression表达式更新,而这个形式可以是列表,元组或者有序字典 updates其实也是每次调用function都会执行一次,则所有的shared_variable都会根据new_expression更新一次值。 givens: 里面存放的是可迭代量,可以是列表,元组或者字典。每次调用function,givens的量都会迭代变化。 它跟inputs一样也是作为参数传递给outputs的 """ # 默认参数 x,y=T.dscalars('x','y') z=x+y f1=function([x, Param(y,default=1,name='by_name')],z) print(f1(33)) print(f1(33,2)) print(f1(33,by_name=3)) # 共享变量 ''' 为了使GPU调用这些变量时,遇到一次就要调用一次,这样就会花费大量时间在数据存取上,导致使用GPU代码运行很慢,甚至比仅用CPU还慢。 共享变量的类型必须为floatX shared 变量可以作为函数将的可以访问的数据,可以用get_value, set_value两个函数访问和获取值 shared 变量既可以作为符号变量,也可以作为共享变量 '''
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None,
          training_cost_prop_change_threshold=0.0005, learning_rate=0.1,
          regularization=0., class_weights_vector=None, corruption_level=0.,
          continuous_corruption=False, loss="xent"):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    batch_iterator should be a repeatable iterator producing batches: it is
    iterated once per training iteration, and each batch is shuffled in
    place before it is used.

    iteration_callback is called after each iteration with args
    (iteration, training cost, validation cost, 0.0, best iteration).

    If a validation set (matrix) is given, it is used to compute a cost
    after each iteration and to enforce a stopping criterion. The algorithm
    will terminate if it goes stopping_iterations iterations without an
    improvement in validation cost, and the weights with the best
    validation cost are restored at the end.

    The algorithm will assume it has converged and stop early if the
    proportional change between successive training costs drops below
    training_cost_prop_change_threshold for five iterations in a row.

    If cost_plot_filename is given, a plot of the costs is saved to it
    after every iteration.

    Uses L2 regularization.

    Raises ValueError if batch_iterator produces no batches.

    """
    if log is None:
        log = get_console_logger("Autoencoder train")

    log.info(
        "Training params: learning rate=%s, noise ratio=%.1f%% (%s), regularization=%s"
        % (learning_rate, corruption_level * 100.0,
           "continuous corruption" if continuous_corruption
           else "zeroing corruption",
           regularization))
    log.info("Training with SGD")

    ######## Compile functions
    # Prepare cost/update functions for training
    cost, updates = self.network.get_cost_updates(
        self.learning_rate,
        self.regularization,
        class_cost_weights=class_weights_vector,
        corruption_level=corruption_level,
        continuous_corruption=continuous_corruption,
        loss=loss)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=[self.network.x, Param(self.regularization, default=0.0)],
        outputs=cost,
    )
    train_fn = theano.function(
        inputs=[
            self.network.x,
            Param(self.learning_rate, default=0.1),
            Param(self.regularization, default=0.0)
        ],
        outputs=cost,
        updates=updates,
    )
    # Prepare a function to test how close to the identity function the
    # learned mapping is. A lower value indicates that it's generalizing
    # more (though not necessarily better).
    identity_ratio_expr = T.mean(
        T.sum(self.network.get_prediction_dist() * (self.network.x > 0),
              axis=1))
    identity_ratio_fn = theano.function(inputs=[self.network.x],
                                        outputs=identity_ratio_expr)
    ###########

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Keep a copy of the best weights so far
    val_cost = 0.
    best_weights = best_iter = best_val_cost = None
    if validation_set is not None:
        best_weights = self.network.get_weights()
        best_iter = -1
        # NOTE(review): this initial value is not divided by the
        # validation set size, unlike the per-iteration val_cost below --
        # confirm whether that is intended.
        best_val_cost = cost_fn(validation_set)

        log.info("Computing initial validation scores")
        f_score, precision, recall, f_score_classes = \
            self.compute_f_scores(validation_set)
        log.info(
            "F-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%"
            % (f_score * 100.0, f_score_classes, precision * 100.0,
               recall * 100.0))
        identity_ratio = identity_ratio_fn(validation_set)
        log.info("Identity ratio = %.4g" % identity_ratio)

    below_threshold_its = 0

    for i in range(iterations):
        err = 0.0
        n_batches = 0
        for batch in batch_iterator:
            # Shuffle the training data between iterations, as one should
            # with SGD. Just shuffle within batches.
            shuffle = numpy.random.permutation(batch.shape[0])
            batch[:] = batch[shuffle]
            # Update the model with this batch's data
            err += train_fn(batch, learning_rate=learning_rate,
                            regularization=regularization)
            n_batches += 1

        if n_batches == 0:
            raise ValueError("batch_iterator produced no batches")
        # Mean cost per batch: divide by the number of batches seen, not
        # by the index of the last one.
        training_costs.append(err / n_batches)

        if validation_set is not None:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set) / validation_set.shape[0]
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the
                # same, it's better to use the new set of weights (with,
                # presumably, a better training error)
                if val_cost == best_val_cost:
                    log.info(
                        "Same validation cost: %.4f, using new weights"
                        % val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost
            if val_cost >= best_val_cost and \
                    i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation
                # error. Time to call a halt and use the best validation
                # error we got.
                log.info(
                    "Stopping after %d iterations of increasing validation cost"
                    % stopping_iterations)
                break

        log.info(
            "COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g"
            % (i, training_costs[-1], val_cost))

        if cost_plot_filename:
            # Plot the cost function as we train. Skip the first costs, as
            # they're usually so much higher than others that the rest is
            # indistinguishable.
            columns = [(training_costs[1:], "Train cost")]
            if validation_set is not None:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            if validation_set is not None:
                # Add a line at the most recent best val cost. Without a
                # validation set there is no best iteration to mark.
                ax.axvline(float(best_iter), color="b")
                ax.text(float(best_iter + 1) + 0.1, best_val_cost * 1.1,
                        "Best val cost", color="b")
            from matplotlib import pyplot as plt
            plt.savefig(cost_plot_filename)

        if validation_set is not None:
            f_score, precision, recall, f_score_classes = \
                self.compute_f_scores(validation_set)
            log.info(
                "Validation f-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%"
                % (f_score * 100.0, f_score_classes, precision * 100.0,
                   recall * 100.0))
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Validation identity ratio = %.4g" % identity_ratio)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_costs[-1], val_cost, 0.0,
                               best_iter)

        # Check the proportional change between this iteration's training
        # cost and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) /
                training_costs[-2])
            if training_cost_prop_change < \
                    training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've
                # converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes:
                    # we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged"
                        % (training_cost_prop_change,
                           training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations"
                        % (training_cost_prop_change,
                           training_cost_prop_change_threshold,
                           below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)
def train(self, batch_iterator, iterations=10000, iteration_callback=None,
          validation_set=None, stopping_iterations=10, log=None,
          cost_plot_filename=None,
          training_cost_prop_change_threshold=0.0005, learning_rate=0.1,
          regularization=0., class_weights_vector=None, corruption_level=0.,
          continuous_corruption=False, loss="xent"):
    """
    Train every layer of the stacked network in turn with minibatch SGD.

    See autoencoder trainer: the same training procedure is applied to each
    layer in ``self.network.layers``, one after the other. Only this
    layer-wise stage is performed here; nothing in this method trains the
    whole stack jointly.

    Parameters:
      batch_iterator: iterable of training batches. It is iterated once to
        compute the initial cost and once more per iteration, so it must be
        re-iterable. Every batch is shuffled IN PLACE (row order only).
      iterations: maximum number of passes over the batches, per layer.
      iteration_callback: if given, called after each iteration as
        ``iteration_callback(i, training_cost, val_cost, 0.0, best_iter)``.
      validation_set: optional validation inputs. When given, the weights
        with the lowest validation cost are restored at the end of each
        layer's training and early stopping is enabled.
      stopping_iterations: stop once this many iterations have passed
        without the validation cost improving on the best seen so far.
      log: logger; a console logger is created when omitted.
      cost_plot_filename: if set, the cost curves are re-plotted to this
        file after every iteration.
      training_cost_prop_change_threshold: training stops when the
        proportional change in training cost stays below this for five
        iterations in a row.
      learning_rate, regularization: values fed to the compiled training
        function on every batch.
      class_weights_vector, corruption_level, continuous_corruption, loss:
        passed through unchanged to ``layer.get_cost_updates``.

    Raises ZeroDivisionError if ``batch_iterator`` yields no batches.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")

    # Because the layers are all already properly stacked, when we get the cost/updates for a layer,
    # it's already a function of the original input, but only updates the layer itself
    for layer_num, layer in enumerate(self.network.layers):
        log.info("TRAINING LAYER %d" % layer_num)

        ## Compile functions
        # Prepare cost/update functions for training
        cost, updates = layer.get_cost_updates(
            self.learning_rate,
            self.regularization,
            class_cost_weights=class_weights_vector,
            corruption_level=corruption_level,
            continuous_corruption=continuous_corruption,
            loss=loss)
        # Prepare training functions
        # Note that these use the initial input, not the layer input
        cost_fn = theano.function(
            inputs=[self.input, Param(self.regularization, default=0.0)],
            outputs=cost,
        )
        train_fn = theano.function(
            inputs=[
                self.input,
                Param(self.learning_rate, default=0.1),
                Param(self.regularization, default=0.0)
            ],
            outputs=cost,
            updates=updates,
        )
        # Prepare a function to test how close to the identity function the learned mapping is
        # A lower value indicates that it's generalizing more (though not necessarily better)
        identity_ratio_expr = T.mean(
            T.sum(layer.get_prediction_dist() * (layer.x > 0), axis=1))
        identity_ratio_fn = theano.function(inputs=[self.input],
                                            outputs=identity_ratio_expr)

        # Keep a record of costs, so we can plot them
        val_costs = []
        training_costs = []

        # Keep a copy of the best weights so far
        val_cost = 0.
        best_weights = best_iter = best_val_cost = None
        if validation_set is not None:
            best_weights = layer.get_weights()
            best_iter = -1
            # Normalize exactly as the per-iteration validation cost below is
            # normalized, so the first comparison is made on the same scale
            best_val_cost = cost_fn(validation_set) / validation_set.shape[0]

            log.info("Computing initial validation scores")
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Identity ratio = %.4g" % identity_ratio)

        log.info("Computing initial training cost")
        batch_costs = [cost_fn(batch) for batch in batch_iterator]
        initial_cost = sum(batch_costs) / len(batch_costs)
        log.info("Cost = %g (%d batches)" % (initial_cost, len(batch_costs)))

        below_threshold_its = 0

        for i in range(iterations):
            err = 0.0
            num_batches = 0
            for batch in batch_iterator:
                # Shuffle the training data between iterations, as one should with SGD
                # Just shuffle within batches
                shuffle = numpy.random.permutation(batch.shape[0])
                batch[:] = batch[shuffle]
                # Update the model with this batch's data
                err += train_fn(batch,
                                learning_rate=learning_rate,
                                regularization=regularization)
                num_batches += 1

            # Mean cost per batch: divide by the number of batches seen, not
            # by the index of the last one (which is one too small)
            training_costs.append(err / num_batches)

            if validation_set is not None:
                # Compute the cost function on the validation set
                val_cost = cost_fn(validation_set) / validation_set.shape[0]
                val_costs.append(val_cost)

                if val_cost <= best_val_cost:
                    # We assume that, if the validation error remains the same, it's better to use the new set of
                    # weights (with, presumably, a better training error)
                    if val_cost == best_val_cost:
                        log.info(
                            "Same validation cost: %.4f, using new weights" %
                            val_cost)
                    else:
                        log.info("New best validation cost: %.4f" % val_cost)
                    # Update our best estimate
                    best_weights = layer.get_weights()
                    best_iter = i
                    best_val_cost = val_cost

                if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                    # We've gone on long enough without improving validation error
                    # Time to call a halt and use the best validation error we got
                    log.info(
                        "Stopping after %d iterations of increasing validation cost"
                        % stopping_iterations)
                    break

                log.info(
                    "COMPLETED ITERATION %d: training cost=%.5g, val cost=%.5g"
                    % (i, training_costs[-1], val_cost))
            else:
                log.info("COMPLETED ITERATION %d: training cost=%.5g" %
                         (i, training_costs[-1]))

            if cost_plot_filename:
                # Plot the cost function as we train
                # Skip the first costs, as they're usually so much higher that the rest is indistinguishable
                columns = [(training_costs[1:], "Train cost")]
                if validation_set is not None:
                    columns.append((val_costs[1:], "Val cost"))
                ax = plot_costs(None, *columns)
                if validation_set is not None:
                    # Add a line at the most recent best val cost. Without a
                    # validation set there is no best iteration to mark.
                    ax.axvline(float(best_iter), color="b")
                    ax.text(float(best_iter + 1) + 0.1,
                            best_val_cost * 1.1,
                            "Best val cost",
                            color="b")
                from matplotlib import pyplot as plt
                plt.savefig(cost_plot_filename)

            if validation_set is not None:
                identity_ratio = identity_ratio_fn(validation_set)
                log.info("Validation identity ratio = %.4g" % identity_ratio)

            if iteration_callback is not None:
                # Not computing training error at the moment
                iteration_callback(i, training_costs[-1], val_cost, 0.0,
                                   best_iter)

            # Check the proportional change between this iteration's training cost and the last
            if len(training_costs) > 2:
                training_cost_prop_change = abs(
                    (training_costs[-2] - training_costs[-1]) /
                    training_costs[-2])
                if training_cost_prop_change < training_cost_prop_change_threshold:
                    # Very small change in training cost - maybe we've converged
                    below_threshold_its += 1
                    if below_threshold_its >= 5:
                        # We've had enough iterations with very small changes: we've converged
                        log.info(
                            "Proportional change in training cost (%g) below %g for five successive iterations: "
                            "converged" %
                            (training_cost_prop_change,
                             training_cost_prop_change_threshold))
                        break
                    else:
                        log.info(
                            "Proportional change in training cost (%g) below %g for %d successive iterations: "
                            "waiting until it's been low for five iterations"
                            % (training_cost_prop_change,
                               training_cost_prop_change_threshold,
                               below_threshold_its))
                else:
                    # Reset the below threshold counter
                    below_threshold_its = 0

        if best_weights is not None:
            # Use the weights that gave us the best error on the validation set
            layer.set_weights(best_weights)
def __init__(self, num_features, num_classes, num_hidden_units=100,
             normalize_features=False, autoencoder=False,
             hidden_activation_fn=None, initialization='glorot'):
    """
    Build the Theano graph for a classifier with one hidden layer and
    compile its prediction and evaluation functions.

    Parameters:
      num_features: width of each input row.
      num_classes: number of output classes.
      num_hidden_units: width of the hidden layer.
      normalize_features: if True, scale the input by its Euclidean norm
        before it enters the network (see the note in the body).
      autoencoder: no longer supported; any true value raises
        NotImplementedError.
      hidden_activation_fn: activation applied to the hidden layer;
        defaults to ``nnet.sigmoid``.
      initialization: weight initialization scheme, one of 'gaussian',
        'glorot' or 'squashed-gaussian'.

    Raises:
      NotImplementedError: if ``autoencoder`` is true.
      ValueError: if ``initialization`` is not a known scheme.
    """
    self.num_features = num_features
    self.num_classes = num_classes
    self.num_hidden_units = num_hidden_units
    self.normalize_features = normalize_features

    if autoencoder:
        raise NotImplementedError(
            "don't use SingleLayerNetwork any more to train an autoencoder. It has "
            "its own implementation, which is better")

    if hidden_activation_fn is None:
        hidden_activation_fn = nnet.sigmoid

    # Set up Theano network for the model
    # Features: (m, num_features)
    self.x = T.matrix("x", dtype="float64")
    # Classes: one integer class index per example, (m,). The one-hot form
    # is derived further down by indexing an identity matrix with it.
    self.y = T.vector("y", dtype="int64")

    if normalize_features:
        # Divide the input by its Euclidean norm before using the values as inputs.
        # NOTE(review): the sum of squares has no axis, so the norm is taken
        # over the whole input matrix rather than per row - confirm this is
        # intended when more than one example is passed at a time.
        self.inputs = ifelse(T.gt(self.x.sum(), 0),
                             self.x / T.sqrt((self.x**2).sum()), self.x)
    else:
        self.inputs = self.x

    if initialization == 'gaussian':
        # Weights and bias, randomly initialized
        self.w0 = theano.shared(numpy.random.randn(num_features,
                                                   num_hidden_units),
                                name="w0")
        self.b0 = theano.shared(numpy.random.randn(num_hidden_units),
                                name="b0")
        self.w1 = theano.shared(numpy.random.randn(num_hidden_units,
                                                   num_classes),
                                name="w1")
        self.b1 = theano.shared(numpy.random.randn(num_classes), name="b1")
    elif initialization == 'glorot':
        # Use Glorot & Bengio's initialization scheme, where the range of random weights depends on the number of
        # hidden units in this and the previous layer
        unif_width0 = math.sqrt(6.) / math.sqrt(num_features +
                                                num_hidden_units)
        self.w0 = theano.shared(numpy.random.uniform(
            -unif_width0, unif_width0, (num_features, num_hidden_units)),
                                name="w0")
        unif_width1 = math.sqrt(6.) / math.sqrt(num_hidden_units +
                                                num_classes)
        self.w1 = theano.shared(numpy.random.uniform(
            -unif_width1, unif_width1, (num_hidden_units, num_classes)),
                                name="w1")
        # Initialize biases to 0
        self.b0 = theano.shared(numpy.zeros(num_hidden_units), name="b0")
        self.b1 = theano.shared(numpy.zeros(num_classes), name="b1")
    elif initialization == 'squashed-gaussian':
        # Similar to initializing with a normalized Gaussian, but squashes the std to 1/sqrt(input_nodes)
        std0 = 1. / math.sqrt(num_features)
        self.w0 = theano.shared(numpy.random.normal(
            0., std0, (num_features, num_hidden_units)),
                                name="w0")
        std1 = 1. / math.sqrt(num_hidden_units)
        # Zero-mean Gaussian, like w0. (A uniform draw on [0, std1) would
        # make every output weight positive.)
        self.w1 = theano.shared(numpy.random.normal(
            0., std1, (num_hidden_units, num_classes)),
                                name="w1")
        # Initialize biases to 0
        self.b0 = theano.shared(numpy.zeros(num_hidden_units), name="b0")
        self.b1 = theano.shared(numpy.zeros(num_classes), name="b1")
    else:
        raise ValueError(
            "unknown initialization type '%s'. Choose gaussian, squashed-gaussian or glorot"
            % initialization)

    # Integer switches: a bias vector is added only when its switch is > 0,
    # otherwise zeros of the same shape are added instead
    self.output_bias = T.scalar("output_bias", dtype="int64")
    self.hidden_bias = T.scalar("hidden_bias", dtype="int64")

    # Construct Theano expression graph
    self.hidden_activation = T.dot(self.inputs, self.w0) + \
        ifelse(T.gt(self.hidden_bias, 0), self.b0, T.zeros(self.b0.shape))
    self.output_activation = T.dot(hidden_activation_fn(self.hidden_activation), self.w1) + \
        ifelse(T.gt(self.output_bias, 0), self.b1, T.zeros(self.b1.shape))
    # Softmax activations to get a probability distribution over the classes
    self.class_probs = nnet.softmax(self.output_activation)
    # The predicted class is that with highest activation (no need to do the softmax for this)
    self.prediction = T.argmax(self.output_activation, axis=1)
    # Fraction of examples whose predicted class differs from the target
    error = T.mean(T.neq(self.prediction, self.y))

    # Compile
    self._predict_fn = theano.function(inputs=[
        self.x,
        Param(self.output_bias, default=1),
        Param(self.hidden_bias, default=1)
    ],
                                       outputs=self.prediction)
    self._prob_fn = theano.function(inputs=[
        self.x,
        Param(self.output_bias, default=1),
        Param(self.hidden_bias, default=1)
    ],
                                    outputs=self.class_probs)
    self._error_fn = theano.function(
        inputs=[
            self.x, self.y,
            Param(self.output_bias, default=1),
            Param(self.hidden_bias, default=1)
        ],
        outputs=error,
    )
    self.hidden_fn = theano.function(
        inputs=[self.x, Param(self.hidden_bias, default=1)],
        outputs=hidden_activation_fn(self.hidden_activation),
    )

    # One-hot (m, num_classes) encodings, built by picking rows of an identity matrix
    one_hot_predictions = T.eye(num_classes, num_classes,
                                dtype="int8")[self.prediction]
    one_hot_targets = T.eye(num_classes, num_classes, dtype="int8")[self.y]

    # Average log-prob of correct answer
    # More useful metric than accuracy, since we can see differences even where the right answer's not top
    mean_target_log_prob = T.mean(
        T.log(self.class_probs[T.arange(self.y.shape[0]), self.y]))
    self.mean_log_prob = theano.function(inputs=[
        self.x, self.y,
        Param(self.output_bias, default=1),
        Param(self.hidden_bias, default=1)
    ],
                                         outputs=mean_target_log_prob)
    # Similar thing, but averaged within classes first, then across
    # the classes that actually occur among the targets
    num_targets = one_hot_targets.sum(axis=0)
    has_targets = T.neq(num_targets, 0.)
    per_class_target_log_prob = T.switch(
        has_targets,
        T.sum(T.log(self.class_probs) * one_hot_targets, axis=0) /
        num_targets, 0.)
    self._mean_per_class_target_log_prob = T.sum(
        per_class_target_log_prob) / T.sum(has_targets)
    self.mean_per_class_target_log_prob = theano.function(
        inputs=[
            self.x, self.y,
            Param(self.output_bias, default=1),
            Param(self.hidden_bias, default=1)
        ],
        outputs=self._mean_per_class_target_log_prob)

    ##### F-score computation
    # True positives per output class
    true_pos = T.cast(T.sum(one_hot_predictions & one_hot_targets, axis=0),
                      dtype="float64")
    # Positive targets per output class
    pos = T.cast(T.sum(one_hot_targets, axis=0), dtype="float64")
    # Predicted positives per output class
    predicted_pos = T.cast(T.sum(one_hot_predictions, axis=0),
                           dtype="float64")
    # If pos==0 (no actual positives) recall is undefined: the numerator is
    # set to NaN there, and the denominator to 1 just to avoid dividing by
    # zero, so recall comes out as NaN for such classes
    recalls = T.switch(T.eq(pos, 0), float('nan'), true_pos) / T.switch(
        T.eq(pos, 0), 1., pos)
    # Simple way out of div zero: wherever predicted_pos==0 we're setting num directly, so 1 denom is fine
    precisions = T.switch(
        T.eq(predicted_pos, 0) & T.eq(pos, 0),
        float('nan'),  # Don't penalize precision if there are no positives
        true_pos / T.switch(T.eq(predicted_pos, 0), 1., predicted_pos))
    # F-score is NaN whenever either component is NaN, and 0 when both are 0
    f_scores = T.switch(
        T.isnan(precisions) | T.isnan(recalls),
        float('nan'),
        2. * precisions * recalls /
        T.switch(precisions + recalls > 0, precisions + recalls, 1.),
    )
    # The per-class metric functions always run with both biases switched on
    self._precisions_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=precisions,
        givens=[(self.output_bias, 1), (self.hidden_bias, 1)],
    )
    self._recalls_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=recalls,
        givens=[(self.output_bias, 1), (self.hidden_bias, 1)],
    )
    self._f_score_fn = theano.function(
        inputs=[self.x, self.y],
        outputs=[f_scores, precisions, recalls],
        givens=[(self.output_bias, 1), (self.hidden_bias, 1)],
    )
def __init__(self, sequence_length=5):
    """
    Assemble the input -> LSTM -> softmax network and compile the Theano
    functions used for training, prediction and validation.

    sequence_length is the number of rows the symbolic input matrix is
    reshaped to; each row has 512 columns.
    """
    self.sequence_length = sequence_length
    self.num_updates = 0

    # Symbolic inputs: the sequence, the integer targets and the train flag
    X = T.matrix().reshape((sequence_length, 512))
    Y = T.ivector()
    is_train = T.scalar()

    # Active stack is a single LSTM feeding a softmax. Dropout layers and
    # additional LSTM layers were tried here previously and are switched off.
    input_layer = InputLayer(X, name="input")
    lstm_layer = LSTMLayer(input_layer,
                           512,
                           512,
                           name="LSTM 1",
                           return_sequences=False)
    softmax_layer = SoftmaxLayer(lstm_layer, name="softmax")

    predicted_class = T.argmax(softmax_layer.output())
    cost = CategoricalCrossEntropy(softmax_layer, Y).output()

    self.layers = [input_layer, lstm_layer, softmax_layer]

    # Momentum updates over weights and biases, each with its own caches
    params = get_params(self.layers)
    biases = get_biases(self.layers)
    caches_params = make_caches(params)
    caches_bias = make_caches(biases)
    eta = T.scalar()
    updates = momentum(cost, params, biases, caches_params, caches_bias,
                       eta)

    # Options common to every compiled function below
    compile_options = dict(allow_input_downcast=True,
                           on_unused_input='warn')

    # is_train defaults to 1 for training and predict_with_drop, and to 0
    # for plain prediction and validation
    self.train = theano.function([X, Y, eta, Param(is_train, 1)],
                                 [cost, predicted_class],
                                 updates=updates,
                                 **compile_options)
    self.predict = theano.function([X, Param(is_train, 0)],
                                   predicted_class,
                                   **compile_options)
    self.predict_with_drop = theano.function([X, Param(is_train, 1)],
                                             predicted_class,
                                             **compile_options)
    self.validate = theano.function([X, Y, Param(is_train, 0)],
                                    cost,
                                    **compile_options)
# Executing multiple functions: one compiled function returning several
# outputs (difference, absolute difference, squared difference)
a, b = T.dmatrices('a', 'b')
diff = a - b
abs_diff = abs(a - b)
diff_sq = diff**2
mult = function([a, b], [diff, abs_diff, diff_sq])
# Single-argument print(...) behaves the same on Python 2 and Python 3
print(mult([[0, 1], [1, 2]], [[-1, 2], [5, 7]]))
# print(pp(diff))
# print(pp(abs_diff))

# Setting a default value for an argument
# So, if arg not given, take default value; else take the given value
x, y = T.dscalars("x", "y")
z = x + y
add = function([x, Param(y, default=1)], z)
print(add(33.0))
print(add(2, 6))

# Setting names to parameters: w is exposed to callers under the keyword
# name "debalu"
x, y, w = T.dscalars("x", "y", "w")
z = (x + y) * w
add_par = function(
    [x, Param(y, default=1),
     Param(w, default=2, name="debalu")], z)
print(add_par(33))
print(add_par(33, 6, debalu=5))
def train(self, xs, ys, iterations=10000, iteration_callback=None,
          validation_xs=None, validation_ys=None, validation_frequency=1,
          learning_rate=0.1, regularization=0.01, plot_errors=None,
          plot_cost=None):
    """
    Train on the whole of (xs, ys) for a fixed number of iterations.

    Each iteration calls the compiled training function once on all of xs
    and ys, applying the update
    ``theta <- theta - learning_rate * self.gtheta``.

    E.g.
        xs = rng.randn(N, num_features)
        ys = rng.randint(size=N, low=0, high=2)

    Parameters:
      xs, ys: training inputs and targets.
      iterations: number of update steps to perform.
      iteration_callback: if given, called after every iteration as
        ``iteration_callback(i, training_cost, training_error,
        validation_error, validation_improvement)``. The last three are
        None on iterations where the validation set is not evaluated.
      validation_xs, validation_ys: optional validation data.
      validation_frequency: evaluate ``self.error`` on the validation and
        training sets every this many iterations.
      learning_rate, regularization: values fed to the training function.
      plot_errors, plot_cost: if set, passed as the first argument to
        ``plot_costs`` to plot the error curves / the training cost.

    ``validation_improvement`` is the percentage by which the validation
    error dropped below the best value seen before it; it is 0.0 when the
    error did not improve or when there is no earlier value to compare to.
    """
    learning_rate_var = T.scalar("alpha")

    # Compute the training function
    # NOTE(review): the call below passes the regularization coefficient by
    # the keyword "reg", which assumes self.reg_coef is named "reg" - confirm
    _train_fn = theano.function(
        inputs=[
            self.x, self.y,
            Param(learning_rate_var, default=0.1),
            Param(self.reg_coef, default=0.01)
        ],
        outputs=self._cost_without_reg,
        updates=[(self.theta, self.theta - learning_rate_var * self.gtheta)
                 ],
    )

    best_validation_error = numpy.inf
    validation_errors = []
    training_errors = []
    costs = []

    for i in range(iterations):
        training_cost = _train_fn(xs,
                                  ys,
                                  alpha=learning_rate,
                                  reg=regularization)

        # Only evaluate on val set every validation_frequencyth iteration
        if validation_xs is not None and (i +
                                          1) % validation_frequency == 0:
            # Compute error on validation set
            validation_error = self.error(validation_xs, validation_ys)
            # Compute error on training set
            training_error = self.error(xs, ys)

            # Compute how much we've improved on the previous best validation error
            if validation_error < best_validation_error:
                if numpy.isfinite(best_validation_error):
                    validation_improvement = (
                        best_validation_error -
                        validation_error) / best_validation_error * 100.0
                else:
                    # First evaluation: no previous best to measure against
                    validation_improvement = 0.0
                # Only ever move the best error downwards
                best_validation_error = validation_error
            else:
                validation_improvement = 0.0
        else:
            validation_error = None
            validation_improvement = None
            training_error = None

        if iteration_callback is not None:
            iteration_callback(i, training_cost, training_error,
                               validation_error, validation_improvement)

        # Plot some graphs
        if plot_cost:
            costs.append(training_cost)
            plot_costs(plot_cost, (costs, "training cost"))
        if plot_errors and validation_error is not None:
            validation_errors.append(validation_error)
            training_errors.append(training_error)
            plot_costs(plot_errors, (training_errors, "training set error"),
                       (validation_errors, "val set error"))
import theano
import theano.tensor as T
from theano import Param
from theano.tensor.shared_randomstreams import RandomStreams

# Elementwise logistic function compiled over a matrix input
x = T.dmatrix('x')
s = 1 / (1 + T.exp(-x))
logistic = theano.function([x], s)
y = logistic([[0, 1], [-1, -2]])
# print(y)

# Default argument values; arguments can also be passed by keyword
x, y, w = T.dscalars('x', 'y', 'w')
z = (x + y) * w
f = theano.function([x, Param(y, default=1), Param(w, default=2)], z)
# Single-argument print(...) behaves the same on Python 2 and Python 3
print(f(9, w=1, y=2))

# Random streams: symbolic 2x2 uniform and normal draws from a seeded stream
srng = RandomStreams(seed=234)
rv_u = srng.uniform((2, 2))
rv_n = srng.normal((2, 2))
f = theano.function([], rv_u)
# rv_u appears three times in one expression. Per the Theano random-streams
# tutorial a random variable is drawn at most once per function execution,
# so this is expected to evaluate to (nearly) zeros.
z = theano.function([], rv_u + rv_u - 2 * rv_u)
print(z)  # the compiled function object itself
print(z())  # the result of calling it
print(rv_u)  # the symbolic variables, not sampled values
print(rv_n)
def train(self, xs, iterations=10000, iteration_callback=None, batch_size=20,
          batch_callback=None, validation_set=None, stopping_iterations=10,
          log=None, cost_plot_filename=None,
          training_cost_prop_change_threshold=0.0005, learning_rate=0.1,
          regularization=None, class_weights=None, corruption_level=0.,
          continuous_corruption=False, loss="xent"):
    """
    Train on data stored in Theano tensors. Uses minibatch training.

    xs are the vectors to train on. Targets needn't be given, since the
    input and output are the same in an autoencoder. Note that xs is
    shuffled IN PLACE (row order) at the start of every iteration.

    Parameters:
      iterations: maximum number of passes over xs.
      iteration_callback: if given, called after each iteration as
        ``iteration_callback(i, training_cost, val_cost, 0.0, best_iter)``.
      batch_size: rows per minibatch; None or 0 trains on all of xs at once.
      batch_callback: if given, called after each minibatch as
        ``batch_callback(batch, num_batches, batch_err)``.
      validation_set: optional; only its first element (the inputs) is
        used, any targets are thrown away. When given, it is used to compute
        a cost after each iteration and to enforce a stopping criterion: the
        algorithm terminates if it goes stopping_iterations iterations
        without an improvement in validation cost, and the weights with the
        best validation cost are restored at the end. When omitted, all
        validation scoring is skipped.
      log: logger; a console logger is created when omitted.
      cost_plot_filename: if set, the cost curves are saved to this file
        after every iteration.
      regularization: L2 regularization coefficient; None means 0.0, which
        is the default the training function is compiled with.
      class_weights: None (no weighting), "freq" (inverse frequency) or
        "log" (inverse log frequency). Anything else raises ValueError.
      corruption_level, continuous_corruption, loss: passed through to
        ``self.network.get_cost_updates``.

    The algorithm will assume it has converged and stop early if the
    proportional change between successive training costs drops below
    training_cost_prop_change_threshold for five iterations in a row.

    Several params are included just to implement the same interface as
    single_hidden_layer. Might want to change this later to be a bit neater.
    """
    if log is None:
        log = get_console_logger("Autoencoder train")
    if regularization is None:
        # The declared default cannot be formatted or fed to the compiled
        # function; fall back to that function's own default of 0.0
        regularization = 0.0

    # NOTE(review): this reports the network's own corruption settings, not
    # the corruption_level / continuous_corruption arguments - confirm
    log.info(
        "Training params: learning rate=%s, noise ratio=%.1f%% (%s), regularization=%.2f"
        % (learning_rate, self.network.corruption_level * 100.0,
           "continuous corruption" if self.network.continuous_corruption
           else "zeroing corruption", regularization))
    # %s rather than %d so that batch_size=None (full batch) can be logged
    log.info("Training with SGD, batch size=%s" % batch_size)

    if class_weights is None:
        # Don't apply any weighting
        class_weights_vector = None
    elif class_weights == "freq":
        # Apply inverse frequency weighting
        class_counts = numpy.maximum(xs.sum(axis=0), 1.0)
        class_weights_vector = 1. / class_counts
        # Rescale so the weights sum to the number of columns
        class_weights_vector *= xs.shape[1] / class_weights_vector.sum()
        log.info("Using inverse frequency class weighting in cost function")
    elif class_weights == "log":
        class_counts = numpy.maximum(xs.sum(axis=0), 1.0)
        class_weights_vector = 1. / (numpy.log(class_counts) + 1.)
        class_weights_vector *= xs.shape[1] / class_weights_vector.sum()
        log.info(
            "Using inverse log frequency class weighting in cost function")
    else:
        raise ValueError("invalid class weighting '%s'" % class_weights)

    ######## Compile functions
    # Prepare cost/update functions for training
    cost, updates = self.network.get_cost_updates(
        self.learning_rate,
        self.regularization,
        class_cost_weights=class_weights_vector,
        corruption_level=corruption_level,
        continuous_corruption=continuous_corruption,
        loss=loss)
    # Prepare training functions
    cost_fn = theano.function(
        inputs=[self.network.x,
                Param(self.regularization, default=0.0)],
        outputs=cost,
    )
    train_fn = theano.function(
        inputs=[
            self.network.x,
            Param(self.learning_rate, default=0.1),
            Param(self.regularization, default=0.0)
        ],
        outputs=cost,
        updates=updates,
    )
    # Prepare a function to test how close to the identity function the learned mapping is
    # A lower value indicates that it's generalizing more (though not necessarily better)
    identity_ratio_expr = T.mean(
        T.sum(self.network.get_prediction_dist() * (self.network.x > 0),
              axis=1))
    identity_ratio_fn = theano.function(inputs=[self.network.x],
                                        outputs=identity_ratio_expr)
    ###########

    has_validation = validation_set is not None
    prediction_targets = prediction_contexts = None
    if has_validation:
        # Throw away ys in validation set
        validation_set = validation_set[0]

        # Prepare a prediction validation set by holding one event out of every chain in the val set
        # NOTE(review): random.choice fails on a row with no positive
        # entries - presumably every validation row has at least one
        prediction_targets = numpy.array([
            random.choice(numpy.where(x_row > 0)[0])
            for x_row in validation_set
        ],
                                         dtype=numpy.int16)
        prediction_contexts = validation_set.copy()
        prediction_contexts[range(prediction_contexts.shape[0]),
                            prediction_targets] = 0.
        prediction_balanced_sample = balanced_array_sample(
            prediction_targets, balance_ratio=4., min_inclusion=1)
        prediction_targets = prediction_targets[prediction_balanced_sample]
        prediction_contexts = prediction_contexts[
            prediction_balanced_sample]
        log.info(
            "Prepared roughly balanced prediction set from validation set with %d examples"
            % prediction_contexts.shape[0])

    # Work out how many batches to do
    if batch_size is None or batch_size == 0:
        num_batches = 1
    else:
        # Floor division keeps this an int on Python 3 as well
        num_batches = xs.shape[0] // batch_size
        if xs.shape[0] % batch_size != 0:
            num_batches += 1

    # Keep a record of costs, so we can plot them
    val_costs = []
    training_costs = []

    # Compute costs using the initialized network
    training_cost = cost_fn(xs)
    training_costs.append(training_cost)

    val_cost = None
    # Keep a copy of the best weights so far
    best_weights = best_iter = best_val_cost = None
    if has_validation:
        val_cost = cost_fn(validation_set)
        val_costs.append(val_cost)

        log.info("Computing initial validation scores")
        f_score, precision, recall, f_score_classes = self.compute_f_scores(
            validation_set)
        log.info(
            "F-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%" %
            (f_score * 100.0, f_score_classes, precision * 100.0,
             recall * 100.0))
        log_prob = self.network.prediction_log_prob(prediction_contexts,
                                                    prediction_targets)
        log.info("Logprob = %.4g" % log_prob)
        gen_log_prob = self.network.generalization_log_prob(
            prediction_contexts, prediction_targets)
        log.info("Generalization logprob = %.4g" % gen_log_prob)
        identity_ratio = identity_ratio_fn(validation_set)
        log.info("Identity ratio = %.4g" % identity_ratio)

        best_weights = self.network.get_weights()
        best_iter = -1
        best_val_cost = val_cost

    below_threshold_its = 0

    for i in range(iterations):
        # Shuffle the training data between iterations, as one should with SGD
        shuffle = numpy.random.permutation(xs.shape[0])
        xs[:] = xs[shuffle]

        if num_batches > 1:
            for batch in range(num_batches):
                # Update the model with this batch's data
                batch_err = train_fn(xs[batch * batch_size:(batch + 1) *
                                        batch_size],
                                     learning_rate=learning_rate,
                                     regularization=regularization)

                if batch_callback is not None:
                    batch_callback(batch, num_batches, batch_err)
        else:
            # Batch training: no need to loop
            train_fn(xs,
                     learning_rate=learning_rate,
                     regularization=regularization)

        # Go back and compute training cost
        training_cost = cost_fn(xs)
        training_costs.append(training_cost)

        if has_validation:
            # Compute the cost function on the validation set
            val_cost = cost_fn(validation_set)
            val_costs.append(val_cost)
            if val_cost <= best_val_cost:
                # We assume that, if the validation error remains the same, it's better to use the new set of
                # weights (with, presumably, a better training error)
                if val_cost == best_val_cost:
                    log.info(
                        "Same validation cost: %.4f, using new weights" %
                        val_cost)
                else:
                    log.info("New best validation cost: %.4f" % val_cost)
                # Update our best estimate
                best_weights = self.network.get_weights()
                best_iter = i
                best_val_cost = val_cost

            if val_cost >= best_val_cost and i - best_iter >= stopping_iterations:
                # We've gone on long enough without improving validation error
                # Time to call a halt and use the best validation error we got
                log.info(
                    "Stopping after %d iterations of increasing validation cost"
                    % stopping_iterations)
                break

            log.info(
                "COMPLETED ITERATION %d: training cost=%.5f, val cost=%.5f"
                % (i, training_cost, val_cost))
        else:
            log.info("COMPLETED ITERATION %d: training cost=%.5f" %
                     (i, training_cost))

        if cost_plot_filename:
            # Plot the cost function as we train
            # Skip the first costs, as they're usually so much higher than others that the rest is indistinguishable
            columns = [(training_costs[1:], "Train cost")]
            if has_validation:
                columns.append((val_costs[1:], "Val cost"))
            ax = plot_costs(None, *columns)
            if has_validation:
                # Add a line at the most recent best val cost
                ax.axvline(float(best_iter), color="b")
                ax.text(float(best_iter + 1) + 0.1,
                        best_val_cost * 1.1,
                        "Best val cost",
                        color="b")
            plt.savefig(cost_plot_filename)

        if has_validation:
            f_score, precision, recall, f_score_classes = self.compute_f_scores(
                validation_set)
            log.info(
                "Validation f-score: %.4f%% (mean over %d classes), P=%.4f%%, R=%.4f%%"
                % (f_score * 100.0, f_score_classes, precision * 100.0,
                   recall * 100.0))
            gen_log_prob = self.network.generalization_log_prob(
                prediction_contexts, prediction_targets)
            log.info("Generalization logprob = %.4g" % gen_log_prob)
            identity_ratio = identity_ratio_fn(validation_set)
            log.info("Validation identity ratio = %.4g" % identity_ratio)

        if iteration_callback is not None:
            # Not computing training error at the moment
            iteration_callback(i, training_cost, val_cost, 0.0, best_iter)

        # Check the proportional change between this iteration's training cost and the last
        if len(training_costs) > 2:
            training_cost_prop_change = abs(
                (training_costs[-2] - training_costs[-1]) /
                training_costs[-2])
            if training_cost_prop_change < training_cost_prop_change_threshold:
                # Very small change in training cost - maybe we've converged
                below_threshold_its += 1
                if below_threshold_its >= 5:
                    # We've had enough iterations with very small changes: we've converged
                    log.info(
                        "Proportional change in training cost (%g) below %g for five successive iterations: "
                        "converged" % (training_cost_prop_change,
                                       training_cost_prop_change_threshold))
                    break
                else:
                    log.info(
                        "Proportional change in training cost (%g) below %g for %d successive iterations: "
                        "waiting until it's been low for five iterations" %
                        (training_cost_prop_change,
                         training_cost_prop_change_threshold,
                         below_threshold_its))
            else:
                # Reset the below threshold counter
                below_threshold_its = 0

    if best_weights is not None:
        # Use the weights that gave us the best error on the validation set
        self.network.set_weights(best_weights)