class Leaf:
    """Base building block that exposes its parameters and updates.

    Special fields looked up on instances:

    + params : white list of params to optimize
    + updates : white list of updates to optimize
    """

    # Class-level symbolic int8 scalar flag.
    is_train = tt.bscalar()

    def __call__(self, *args, **kwargs):
        """Apply the block; must be provided by a derived class."""
        raise NotImplementedError("implement in a derived class")

    def get_params(self):
        """Return whatever ``search_shared`` finds for this object.

        If a ``params`` attribute exists, only that white list is searched;
        otherwise every attribute listed by ``dir(self)`` is inspected.
        """
        if not hasattr(self, "params"):
            return search_shared(getattr(self, attr) for attr in dir(self))
        return search_shared(self.params)

    def get_updates(self):
        """Return the result of ``search_updates`` applied to this object."""
        return search_updates(self)

    def optimize(self, cost: tt.Variable, optimizer: Optimizer):
        """Build the update mapping that minimises ``cost``.

        The optimizer's updates for the parameters are computed first and
        then extended (and possibly overridden) with this object's own
        updates.
        """
        combined = optimizer.updates(self.get_params(), cost)
        own_updates = self.get_updates()
        combined.update(own_updates)
        return combined

    def state_list(self):
        """Return the current value of every parameter, in parameter order."""
        return [param.get_value() for param in self.get_params()]

    def load_state_list(self, state):
        """Assign the values in ``state`` to the parameters, pairwise.

        Pairing is done with ``zip``, so surplus entries on either side are
        ignored.
        """
        for param, value in zip(self.get_params(), state):
            param.set_value(value)
def __init__(self, data, config, fast_predict=False):
    """Build the embedding -> stacked RNN -> softmax symbolic graph.

    ``data`` supplies the embeddings, dictionaries and ``unk_id``;
    ``config`` supplies the RNN cell type, sizes and dropout settings.
    ``fast_predict`` is forwarded unchanged to every RNN layer.
    """
    # Settings copied from the data set and the configuration.
    self.embedding_shapes = data.embedding_shapes
    self.lstm_type = config.lstm_cell
    self.lstm_hidden_size = int(config.lstm_hidden_size)
    self.num_lstm_layers = int(config.num_lstm_layers)
    self.max_grad_norm = float(config.max_grad_norm)
    self.vocab_size = data.word_dict.size()
    self.label_space_size = data.label_dict.size()
    self.unk_id = data.unk_id

    # Layers and their parameters, collected in construction order.
    self.embedding_layer = EmbeddingLayer(data.embedding_shapes, data.embeddings)
    self.params = list(self.embedding_layer.params)

    self.rnn_layers = []
    for depth in range(self.num_lstm_layers):
        # Only the first layer reads the embeddings; deeper layers read
        # the previous layer's hidden state.
        if depth == 0:
            input_dim = self.embedding_layer.output_size
        else:
            input_dim = self.lstm_hidden_size

        # Input dropout applies to the first layer, or to every layer
        # when per-layer dropout is enabled.
        if config.per_layer_dropout or depth == 0:
            input_dropout = config.input_dropout_prob
        else:
            input_dropout = 0.0

        layer = get_rnn_layer(self.lstm_type)(
            input_dim,
            self.lstm_hidden_size,
            input_dropout_prob=input_dropout,
            recurrent_dropout_prob=config.recurrent_dropout_prob,
            fast_predict=fast_predict,
            prefix='lstm_{}'.format(depth))
        self.rnn_layers.append(layer)
        self.params.extend(layer.params)

    self.softmax_layer = SoftmaxLayer(self.lstm_hidden_size, self.label_space_size)
    self.params.extend(self.softmax_layer.params)

    # Symbolic inputs.
    # Shape of x: [seq_len, batch_size, num_features]
    self.x0 = tensor.ltensor3('x')
    self.y0 = tensor.lmatrix('y')
    self.mask0 = tensor.matrix('mask', dtype=floatX)
    self.is_train = tensor.bscalar('is_train')

    # Swap the first two axes of every input.
    self.x = self.x0.dimshuffle(1, 0, 2)
    self.y = self.y0.dimshuffle(1, 0)
    self.mask = self.mask0.dimshuffle(1, 0)

    # inputs[k] is what layer k consumes; the last entry feeds the softmax.
    self.inputs = [self.embedding_layer.connect(self.x)]
    self.rev_mask = self.mask[::-1]
    for depth, rnn in enumerate(self.rnn_layers):
        # Each layer's output is reversed along the first axis before it
        # is handed on, so odd layers receive the reversed mask.
        layer_mask = self.rev_mask if depth % 2 else self.mask
        outputs = rnn.connect(self.inputs[depth], layer_mask, self.is_train)
        self.inputs.append(outputs[::-1])

    self.scores, self.pred = self.softmax_layer.connect(self.inputs[-1])
    # Reshape predictions to x's first two dimensions, then swap them back.
    pred_shape = [self.x.shape[0], self.x.shape[1]]
    self.pred0 = self.pred.reshape(pred_shape).dimshuffle(1, 0)
def test_replacements(binomial_model_inference):
    """Check that ``sample_node`` honours the ``deterministic`` switch."""

    def varies(draws):
        # True when at least one pair of consecutive draws differs.
        return any(operator.ne(nxt, prev)
                   for nxt, prev in zip(draws[1:], draws[:-1]))

    def constant(draws):
        # True when every pair of consecutive draws is equal.
        return all(operator.eq(nxt, prev)
                   for nxt, prev in zip(draws[1:], draws[:-1]))

    flag = tt.bscalar()
    flag.tag.test_value = 1
    approx = binomial_model_inference.approx
    cubed = approx.model.p ** 3

    # Default replacement: every evaluation draws a new sample.
    stochastic_node = approx.sample_node(cubed)
    if theano.config.compute_test_value != 'off':
        assert stochastic_node.tag.test_value.shape == cubed.tag.test_value.shape
    sampled = [stochastic_node.eval() for _ in range(100)]
    assert varies(sampled)  # stochastic

    # Python-level deterministic switch.
    fixed_node = approx.sample_node(cubed, deterministic=True)
    sampled = [fixed_node.eval() for _ in range(100)]
    assert constant(sampled)  # deterministic

    # Symbolic switch: behaviour is chosen at evaluation time.
    switched_node = approx.sample_node(cubed, deterministic=flag)
    sampled = [switched_node.eval({flag: 1}) for _ in range(100)]
    assert constant(sampled)  # deterministic
    sampled = [switched_node.eval({flag: 0}) for _ in range(100)]
    assert varies(sampled)  # stochastic
def random_fn(self):
    """
    Compile the posterior sampler and return a Python wrapper around it.

    The returned callable accepts

    Parameters
    ----------
    size : number of samples from distribution
    no_rand : whether use deterministic distribution

    Returns
    -------
    posterior space (numpy)
    """
    size = tt.iscalar('size')
    no_rand = tt.bscalar('no_rand')
    posterior = self.random(size, no_rand)
    symbolic_inputs = [
        theano.In(size, 'size', 1, allow_downcast=True),
        theano.In(no_rand, 'no_rand', 0, allow_downcast=True),
    ]
    compiled = theano.function(symbolic_inputs, posterior)

    def inner(size=None, no_rand=False):
        flag = int(no_rand)
        if size is not None:
            return compiled(size, flag)
        # No size requested: draw one sample and return its first entry.
        return compiled(1, flag)[0]

    return inner
def test_param_allow_downcast_int(self):
    """Exercise the per-parameter ``allow_downcast`` settings on integers."""
    vec16 = tensor.wvector('a')  # int16
    vec8 = tensor.bvector('b')  # int8
    scal8 = tensor.bscalar('c')  # int8
    params = [
        Param(vec16, allow_downcast=True),
        Param(vec8, allow_downcast=False),
        Param(scal8, allow_downcast=None),
    ]
    f = pfunc(params, (vec16 + vec8 + scal8))

    # In-range list inputs are converted and their values checked.
    in_range = f([3], [6], 1)
    assert numpy.all(in_range == 10)

    # An explicitly too-wide dtype is rejected without inspecting the
    # data, even though the value itself would fit.
    too_wide = numpy.array([6], dtype='int16')
    self.assertRaises(TypeError, f, [3], too_wide, 1)

    # Overflow on `a` (allow_downcast=True) is silently ignored.
    overflowed = f([2**20], numpy.ones(1, dtype='int8'), 1)
    assert numpy.all(overflowed == 2)

    # Overflow on `b` (allow_downcast=False) raises TypeError.
    self.assertRaises(TypeError, f, [3], [312], 1)

    # Overflow on `c` (allow_downcast=None) raises TypeError.
    self.assertRaises(TypeError, f, [3], [6], 806)
def test_allow_input_downcast_int(self):
    """Exercise the function-wide ``allow_input_downcast`` flag on integers."""
    a = tensor.wvector('a')  # int16
    b = tensor.bvector('b')  # int8
    c = tensor.bscalar('c')  # int8
    total = (a + b + c)

    def check_strict(fn):
        # In-range lists/scalars are converted and their values checked.
        assert numpy.all(fn([3], [6], 0) == 9)
        # An explicitly too-wide dtype is rejected without looking at the
        # data, even though the value would fit.
        self.assertRaises(TypeError, fn, [3],
                          numpy.array([6], dtype='int16'), 0)
        # Value too big for b, raises TypeError.
        self.assertRaises(TypeError, fn, [3], [312], 0)

    # With downcasting allowed, out-of-range values are silently ignored.
    f = pfunc([a, b, c], total, allow_input_downcast=True)
    assert f([2**20], [1], 0) == 1
    assert f([3], [312], 0) == 59
    assert f([3], [1], 806) == 42

    g = pfunc([a, b, c], total, allow_input_downcast=False)
    check_strict(g)

    # Default (allow_input_downcast=None) should behave like False here.
    h = pfunc([a, b, c], total)
    check_strict(h)
def test_param_allow_downcast_int(self):
    """Exercise the per-input ``allow_downcast`` settings on integers."""
    vec16 = tensor.wvector("a")  # int16
    vec8 = tensor.bvector("b")  # int8
    scal8 = tensor.bscalar("c")  # int8
    inputs = [
        In(vec16, allow_downcast=True),
        In(vec8, allow_downcast=False),
        In(scal8, allow_downcast=None),
    ]
    f = pfunc(inputs, (vec16 + vec8 + scal8))

    # In-range list inputs are converted and their values checked.
    in_range = f([3], [6], 1)
    assert np.all(in_range == 10)

    # An explicitly too-wide dtype is rejected without inspecting the
    # data, even though the value itself would fit.
    with pytest.raises(TypeError):
        f([3], np.array([6], dtype="int16"), 1)

    # Overflow on `a` (allow_downcast=True) is silently ignored.
    overflowed = f([2**20], np.ones(1, dtype="int8"), 1)
    assert np.all(overflowed == 2)

    # Overflow on `b` (allow_downcast=False) raises TypeError.
    with pytest.raises(TypeError):
        f([3], [312], 1)

    # Overflow on `c` (allow_downcast=None) raises TypeError.
    with pytest.raises(TypeError):
        f([3], [6], 806)
def test_param_allow_downcast_int(self):
    """Check how each ``Param`` downcast policy treats integer inputs."""
    int16_vec = tensor.wvector('a')  # int16
    int8_vec = tensor.bvector('b')  # int8
    int8_scalar = tensor.bscalar('c')  # int8
    expression = (int16_vec + int8_vec + int8_scalar)
    f = pfunc(
        [
            Param(int16_vec, allow_downcast=True),
            Param(int8_vec, allow_downcast=False),
            Param(int8_scalar, allow_downcast=None),
        ],
        expression,
    )

    # Lists (not ndarrays) with in-range values: converted, then checked.
    assert numpy.all(f([3], [6], 1) == 10)

    # The value fits, but an explicit wider dtype was supplied; the data
    # itself is not inspected, so this is rejected.
    wide_b = numpy.array([6], dtype='int16')
    self.assertRaises(TypeError, f, [3], wide_b, 1)

    # Value too big for a, silently ignored.
    small_b = numpy.ones(1, dtype='int8')
    assert numpy.all(f([2 ** 20], small_b, 1) == 2)

    # Value too big for b, raises TypeError.
    self.assertRaises(TypeError, f, [3], [312], 1)

    # Value too big for c, raises TypeError.
    self.assertRaises(TypeError, f, [3], [6], 806)
def test_allow_input_downcast_int(self):
    """Check ``allow_input_downcast`` set to True, False and the default."""
    a = tensor.wvector('a')  # int16
    b = tensor.bvector('b')  # int8
    c = tensor.bscalar('c')  # int8
    symbolic_inputs = [a, b, c]

    def assert_rejects_downcast(fn):
        # All values in range and given as lists/scalars: converted and
        # value-checked.
        assert numpy.all(fn([3], [6], 0) == 9)
        # In range, but an explicit wider dtype was supplied; the data is
        # not inspected, so this is a TypeError.
        wide_b = numpy.array([6], dtype='int16')
        self.assertRaises(TypeError, fn, [3], wide_b, 0)
        # Value too big for b, raises TypeError.
        self.assertRaises(TypeError, fn, [3], [312], 0)

    f = pfunc(list(symbolic_inputs), (a + b + c), allow_input_downcast=True)
    # Value too big for a, b, or c, silently ignored.
    assert f([2 ** 20], [1], 0) == 1
    assert f([3], [312], 0) == 59
    assert f([3], [1], 806) == 42

    g = pfunc(list(symbolic_inputs), (a + b + c), allow_input_downcast=False)
    assert_rejects_downcast(g)

    # Default: allow_input_downcast=None; should behave like False.
    h = pfunc(list(symbolic_inputs), (a + b + c))
    assert_rejects_downcast(h)
def random_fn(self):
    """
    Build a callable that samples the posterior from the initial latent space.

    The returned callable takes

    Parameters
    ----------
    size : number of samples from distribution
    no_rand : whether use deterministic distribution

    Returns
    -------
    posterior space (numpy)
    """
    size_var = tt.iscalar('size')
    no_rand_var = tt.bscalar('no_rand')
    posterior = self.random(size_var, no_rand_var)

    size_in = theano.In(size_var, 'size', 1, allow_downcast=True)
    no_rand_in = theano.In(no_rand_var, 'no_rand', 0, allow_downcast=True)
    sampler = theano.function([size_in, no_rand_in], posterior)

    def inner(size=None, no_rand=False):
        # With no size given, draw one sample and return its first entry.
        if size is None:
            single = sampler(1, int(no_rand))
            return single[0]
        return sampler(size, int(no_rand))

    return inner
def __init__(self, n_features):
    """Create the symbolic inputs and the shared parameters ``W`` and ``b``.

    ``W`` is drawn with ``rng.randn(n_features)`` and ``b`` starts at zero;
    both are stored with dtype ``theano.config.floatX``.
    """
    float_type = theano.config.floatX

    self.n_features = n_features
    self.x = T.fvector("x")
    self.y = T.bscalar("y")

    initial_weights = rng.randn(n_features).astype(float_type)
    self.W = theano.shared(initial_weights, name="W")

    initial_bias = numpy.asarray(0., dtype=float_type)
    self.b = theano.shared(initial_bias, name="b")
def get_cost(self):
    """Build, store and return the training cost.

    The cost is the negative log-likelihood of ``self.y`` plus the
    L2 penalty weighted by ``self.L2_reg``. The L1 norm is computed and
    stored in ``self.L1`` but is not added to the cost.
    """
    self.y = T.bscalar('y')

    hidden_w = self.hidden_layer.W
    output_w = self.output_layer.W

    self.L1 = abs(hidden_w).sum() + abs(output_w).sum()
    self.L2_sqr = (hidden_w ** 2).sum() + (output_w ** 2).sum()
    self.params = self.hidden_layer.params + self.output_layer.params

    nll = self.negative_log_likelihood(self.y)
    self.cost = nll + self.L2_reg * self.L2_sqr
    return self.cost
def __init__(self, input=None, output=None, n_features=500, n_states=10,
             learning_rate=0.01):
    """Set up a softmax classifier over ``n_states`` labels.

    :param input: symbolic input; a new ``T.fvector('x')`` is created
        when omitted.
    :param output: symbolic label (0, 1, 2, ...); a new
        ``T.bscalar('y')`` is created when omitted.
    :param n_features: number of entries in the input.
    :param n_states: number of labels.
    :param learning_rate: stored on the instance as ``learning_rate``.
    """
    self.n_features = n_features
    self.n_states = n_states

    # Test for None explicitly instead of relying on the truthiness of a
    # symbolic variable, which is not a meaningful "was it supplied" check.
    self.x = T.fvector('x') if input is None else input
    self.y = T.bscalar('y') if output is None else output

    self.b = theano.shared(
        numpy.zeros((n_states), dtype=theano.config.floatX),
        name='b',
        borrow=True,
    )
    self.W = theano.shared(
        value=numpy.zeros((n_features, n_states),
                          dtype=theano.config.floatX),
        name='W',
        borrow=True,
    )

    # W is (n_features, n_states) and x carries n_features entries, so the
    # product has to be x . W to give n_states scores that can be added
    # to b. The previous dot(W, x) only type-checked at run time when
    # n_features happened to equal n_states.
    self.p_y_given_x = T.nnet.softmax(T.dot(self.x, self.W) + self.b)

    # Index of the most probable label.
    self.y_pred = T.argmax(self.p_y_given_x, axis=1)
    self.get_y_pred = theano.function(inputs=[self.x],
                                      allow_input_downcast=True,
                                      outputs=self.y_pred)

    self.learning_rate = learning_rate
    self.params = [self.W, self.b]
def test_replacements(binomial_model_inference):
    """Check that ``apply_replacements`` honours the ``deterministic`` switch."""

    def adjacent_pairs(draws):
        # Consecutive (next, previous) pairs of the drawn values.
        return zip(draws[1:], draws[:-1])

    def is_stochastic(draws):
        return any(operator.ne(nxt, prev) for nxt, prev in adjacent_pairs(draws))

    def is_deterministic(draws):
        return all(operator.eq(nxt, prev) for nxt, prev in adjacent_pairs(draws))

    switch = tt.bscalar()
    switch.tag.test_value = 1
    approx = binomial_model_inference.approx
    target = approx.model.p**3

    # Default replacement resamples on every evaluation.
    random_node = approx.apply_replacements(target)
    sampled = [random_node.eval() for _ in range(100)]
    assert is_stochastic(sampled)  # stochastic

    # Python-level deterministic switch.
    fixed_node = approx.apply_replacements(target, deterministic=True)
    sampled = [fixed_node.eval() for _ in range(100)]
    assert is_deterministic(sampled)  # deterministic

    # Symbolic switch, chosen at evaluation time.
    switched_node = approx.apply_replacements(target, deterministic=switch)
    sampled = [switched_node.eval({switch: 1}) for _ in range(100)]
    assert is_deterministic(sampled)  # deterministic
    sampled = [switched_node.eval({switch: 0}) for _ in range(100)]
    assert is_stochastic(sampled)  # stochastic
def build_trainer(phi_shared, N, loglik_primary_f, logprior_f, hypernet_f,
                  log_det_dtheta_dz_f=None, primary_f=None):
    '''Compile the training and diagnostic functions for a hypernetwork.

    It is assumed every time the returned functions are called z_noise will
    be drawn from a standard Gaussian. phi_shared are weights to hypernet
    and N is the total number of points in the data set.

    Returns the tuple
    (trainer, get_err, test_f, primary_out, grad_f, theta_f), where
    primary_out is None unless primary_f is given.'''
    X = T.matrix('x')
    y = T.matrix('y')  # Assuming multivariate output
    z_noise = T.vector('z')
    prelim = T.bscalar('prelim')
    lr = T.scalar('lr')

    # Positional arguments shared by both ELBO graphs below.
    elbo_args = (X, y, loglik_primary_f, logprior_f, hypernet_f, z_noise, N)
    # Inputs shared by the diagnostic functions.
    eval_inputs = (X, y, z_noise, prelim)

    elbo = hypernet_elbo(*elbo_args,
                         log_det_dtheta_dz_f=log_det_dtheta_dz_f,
                         prelim=prelim)
    loss = -elbo
    grads = T.grad(loss, phi_shared)
    updates = lasagne.updates.adam(grads, phi_shared, learning_rate=lr)
    trainer = theano.function([X, y, z_noise, lr, prelim], loss,
                              updates=updates)

    # Build get_err in case you want to check Jacobian logic: it compares
    # the ELBO above with one built without log_det_dtheta_dz_f.
    elbo_no_J = hypernet_elbo(*elbo_args, prelim=prelim)
    err = T.abs_(elbo - elbo_no_J)
    get_err = theano.function(list(eval_inputs), err)

    theta = hypernet_f(z_noise, prelim=prelim)
    theta_f = theano.function([z_noise, prelim], theta)

    test_loglik = loglik_primary_f(X, y, theta)
    test_f = theano.function(list(eval_inputs), test_loglik)

    if primary_f is None:
        primary_out = None
    else:
        yp = primary_f(X, theta)
        primary_out = theano.function([X, z_noise, prelim], yp)

    grad_f = theano.function(list(eval_inputs), grads)
    return trainer, get_err, test_f, primary_out, grad_f, theta_f
def test_mlp(learning_rate=0.001, L1_reg=0.0, L2_reg=1, n_epochs=10000,
             dataset='Carolyn1_filt_turnclass.csv', batch_size=3, n_hidden=20):
    """
    Train a multilayer perceptron with minibatch stochastic gradient
    descent and early stopping, printing validation and test errors.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: first argument handed to ``load_data``; the second
    argument is always 'Carolyn1_filt_turnlabels.csv'

    :type batch_size: int
    :param batch_size: number of examples per minibatch

    :type n_hidden: int
    :param n_hidden: ``n_hidden`` argument of the MLP
    """
    datasets = load_data(dataset, 'Carolyn1_filt_turnlabels.csv')
    (train_set_x, train_set_y), (valid_set_x, valid_set_y), \
        (test_set_x, test_set_y) = datasets[0], datasets[1], datasets[2]

    # Number of whole minibatches in each split.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # Symbolic variables for the data.
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # input data, one example per row
    y = T.ivector('y')  # labels as a 1D vector of ints
    z = T.bscalar()  # passed to the MLP as `ts`

    rng = numpy.random.RandomState(1234)

    classifier = MLP(
        rng=rng,
        input=x,
        n_in=24,
        n_hidden=n_hidden,
        n_out=2,
        ts=z
    )

    # start-snippet-4
    # Symbolic cost: negative log likelihood plus L1 and L2 penalties.
    cost = (
        classifier.negative_log_likelihood(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )
    # end-snippet-4

    # Functions returning the error on one test / validation minibatch.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
        }
    )
    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
        }
    )

    # start-snippet-5
    # One gradient per parameter, in the order of classifier.params.
    gparams = [T.grad(cost, param) for param in classifier.params]

    # Plain SGD step for every (parameter, gradient) pair.
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # Returns the cost of one training minibatch and applies `updates`.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
        }
    )
    # end-snippet-5

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # Early-stopping parameters.
    patience = 10000  # minimum number of iterations before stopping
    patience_increase = 2  # factor applied to the iteration on a new best
    improvement_threshold = 0.995  # relative improvement counted as significant
    # Validate after this many minibatches (here: at most once per epoch).
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    while epoch < n_epochs and not done_looping:
        epoch += 1
        for minibatch_index in range(n_train_batches):
            # NOTE(review): `ts` is reassigned with a Python bool after the
            # functions were compiled; whether that influences them depends
            # on MLP, which is not visible here - confirm.
            classifier.ts = True
            train_model(minibatch_index)

            # Global iteration number, counted from zero.
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                # Mean zero-one loss over the validation set.
                classifier.ts = False
                this_validation_loss = numpy.mean(
                    [validate_model(i) for i in range(n_valid_batches)]
                )
                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                if this_validation_loss < best_validation_loss:
                    # Extend patience when the improvement is large enough.
                    significant = (
                        this_validation_loss <
                        best_validation_loss * improvement_threshold
                    )
                    if significant:
                        patience = max(patience, iteration * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # Score the new best model on the test set.
                    classifier.ts = False
                    test_score = numpy.mean(
                        [test_model(i) for i in range(n_test_batches)]
                    )
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)
def __init__(
    self,
    dataset,
    learning_rate=0.001,
    decrease_constant=0,
    hidden_sizes=[500],
    random_seed=1234,
    batch_size=1,
    hidden_activation=T.nnet.sigmoid,
    use_cond_mask=False,
    direct_input_connect="None",
    direct_output_connect=False,
    update_rule="None",
    dropout_rate=0,
    weights_initialization="Uniform",
    mask_distribution=0,
):
    """Build the masked network graph and compile its Theano functions.

    :param dataset: mapping with "input_size" and, under "train", "valid"
        and "test", a "data" entry used both as input and as target.
    :param learning_rate: forwarded to the selected update rule (as
        ``epsilon`` for "adadelta").
    :param decrease_constant: forwarded to the update rules that take it.
    :param hidden_sizes: sizes of the hidden layers; the list default is
        kept for interface compatibility and is not mutated here.
    :param random_seed: seed of the generator all other seeds derive from.
    :param batch_size: minibatch size used by ``learn`` and ``learngrad``.
    :param hidden_activation: activation of the hidden layers.
    :param use_cond_mask: forwarded to every layer.
    :param direct_input_connect: "Full" connects the input to the later
        hidden layers and the output layer, "Output" to the output layer
        only; any other value disables direct input connections.
    :param direct_output_connect: when true, the output layer also
        receives the inputs of intermediate layers.
    :param update_rule: one of "None", "adadelta", "adagrad", "rmsprop",
        "adam", "adam_paper".
    :param dropout_rate: forwarded to ``dropoutLayerDecorator``.
    :param weights_initialization: name of a ``WeightsInitializer``
        attribute.
    :param mask_distribution: forwarded to ``MaskGenerator``.
    :raises ValueError: if ``update_rule`` is not a known name.
    """
    input_size = dataset["input_size"]
    self.shuffled_once = False

    class SeedGenerator(object):
        # Hands out a reproducible stream of seeds so that each random
        # component gets its own seed.
        def __init__(self, random_seed):
            self.rng = np.random.mtrand.RandomState(random_seed)

        def get(self):
            return self.rng.randint(42424242)

    # The order of the get() calls below determines which seed each
    # component receives; keep it stable for reproducibility.
    self.seed_generator = SeedGenerator(random_seed)
    self.trng = T.shared_randomstreams.RandomStreams(self.seed_generator.get())
    # Look the weights initializer up by its string name.
    weights_initialization = getattr(
        WeightsInitializer(self.seed_generator.get()), weights_initialization
    )

    # Symbolic inputs of the model's graph.
    input_var = T.matrix(name="input")
    target_var = T.matrix(name="target")
    is_train = T.bscalar(name="is_train")

    # Initialize the mask
    self.mask_generator = MaskGenerator(
        input_size, hidden_sizes, mask_distribution, self.seed_generator.get()
    )

    # First layer, wrapped with dropout.
    input_layer = ConditionningMaskedLayer(
        layerIdx=0,
        input=input_var,
        n_in=input_size,
        n_out=hidden_sizes[0],
        activation=hidden_activation,
        weights_initialization=weights_initialization,
        mask_generator=self.mask_generator,
        use_cond_mask=use_cond_mask,
    )
    self.layers = [dropoutLayerDecorator(input_layer, self.trng, is_train, dropout_rate)]

    # Remaining hidden layers.
    for i in range(1, len(hidden_sizes)):
        previous_layer = self.layers[i - 1]
        hidden_layer = DirectInputConnectConditionningMaskedLayer(
            layerIdx=i,
            input=previous_layer.output,
            n_in=hidden_sizes[i - 1],
            n_out=hidden_sizes[i],
            activation=hidden_activation,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask,
            direct_input=input_var
            if direct_input_connect == "Full" and previous_layer.output != input_var
            else None,
        )
        self.layers += [dropoutLayerDecorator(hidden_layer, self.trng, is_train, dropout_rate)]

    # Output layer. self.layers[1:-1] is evaluated before the output layer
    # is appended, so it refers to the hidden layers built so far.
    outputLayerIdx = len(self.layers)
    previous_layer = self.layers[outputLayerIdx - 1]
    self.layers += [
        DirectOutputInputConnectConditionningMaskedOutputLayer(
            layerIdx=outputLayerIdx,
            input=previous_layer.output,
            n_in=hidden_sizes[outputLayerIdx - 1],
            n_out=input_size,
            activation=T.nnet.sigmoid,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask,
            direct_input=input_var
            if (direct_input_connect == "Full" or direct_input_connect == "Output")
            and previous_layer.output != input_var
            else None,
            direct_outputs=[
                (layer.layer_idx, layer.n_in, layer.input) for layer in self.layers[1:-1]
            ]
            if direct_output_connect
            else [],
        )
    ]

    # The loss function, computed on the pre-activation of the last layer:
    # for targets in {0, 1}, softplus(-a) when target is 1 and softplus(a)
    # when target is 0.
    output = self.layers[-1].output
    pre_output = self.layers[-1].lin_output
    log_prob = -T.sum(
        T.nnet.softplus(-target_var * pre_output + (1 - target_var) * pre_output), axis=1
    )
    loss = (-log_prob).mean()

    # How to update the parameters
    self.parameters = [param for layer in self.layers for param in layer.params]
    parameters_gradient = T.grad(loss, self.parameters)

    # Select the update rule; unknown names are rejected explicitly
    # instead of leaving self.update_rule unset.
    if update_rule == "None":
        self.update_rule = DecreasingLearningRate(learning_rate, decrease_constant)
    elif update_rule == "adadelta":
        self.update_rule = AdaDelta(decay=decrease_constant, epsilon=learning_rate)
    elif update_rule == "adagrad":
        self.update_rule = AdaGrad(learning_rate=learning_rate)
    elif update_rule == "rmsprop":
        self.update_rule = RMSProp(learning_rate=learning_rate, decay=decrease_constant)
    elif update_rule == "adam":
        self.update_rule = Adam(learning_rate=learning_rate)
    elif update_rule == "adam_paper":
        self.update_rule = Adam_paper(learning_rate=learning_rate)
    else:
        raise ValueError("Unknown update_rule: {!r}".format(update_rule))
    updates = self.update_rule.get_updates(zip(self.parameters, parameters_gradient))

    # How to shuffle the weight masks
    masks_updates = [
        layer_mask_update for layer in self.layers for layer_mask_update in layer.shuffle_update
    ]
    self.update_masks = theano.function(name="update_masks", inputs=[], updates=masks_updates)

    # Functions to train and use the model. Every function below passes
    # on_unused_input="ignore" because is_train is unused when dropout is
    # absent.
    train_data = dataset["train"]["data"]
    index = T.lscalar()
    self.learn = theano.function(
        name="learn",
        inputs=[index, is_train],
        outputs=loss,
        updates=updates,
        givens={
            input_var: train_data[index * batch_size : (index + 1) * batch_size],
            target_var: train_data[index * batch_size : (index + 1) * batch_size],
        },
        on_unused_input="ignore",
    )
    self.use = theano.function(
        name="use", inputs=[input_var, is_train], outputs=output, on_unused_input="ignore"
    )

    # Test functions
    self.valid_log_prob = theano.function(
        name="valid_log_prob",
        inputs=[is_train],
        outputs=log_prob,
        givens={input_var: dataset["valid"]["data"], target_var: dataset["valid"]["data"]},
        on_unused_input="ignore",
    )
    self.train_log_prob = theano.function(
        name="train_log_prob",
        inputs=[is_train],
        outputs=log_prob,
        givens={input_var: train_data, target_var: train_data},
        on_unused_input="ignore",
    )
    # Fixed chunks of 1000 rows, independent of batch_size.
    self.train_log_prob_batch = theano.function(
        name="train_log_prob_batch",
        inputs=[index, is_train],
        outputs=log_prob,
        givens={
            input_var: train_data[index * 1000 : (index + 1) * 1000],
            target_var: train_data[index * 1000 : (index + 1) * 1000],
        },
        on_unused_input="ignore",
    )
    self.test_log_prob = theano.function(
        name="test_log_prob",
        inputs=[is_train],
        outputs=log_prob,
        givens={input_var: dataset["test"]["data"], target_var: dataset["test"]["data"]},
        on_unused_input="ignore",
    )

    # Functions for verifying the gradient
    self.useloss = theano.function(
        name="useloss",
        inputs=[input_var, target_var, is_train],
        outputs=loss,
        on_unused_input="ignore",
    )
    # Named after the attribute; it used to share the name "learn" with
    # self.learn.
    self.learngrad = theano.function(
        name="learngrad",
        inputs=[index, is_train],
        outputs=parameters_gradient,
        givens={
            input_var: train_data[index * batch_size : (index + 1) * batch_size],
            target_var: train_data[index * batch_size : (index + 1) * batch_size],
        },
        on_unused_input="ignore",
    )
def evaluate_lenet5(learning_rate=0.1, n_epochs=200,
                    dataset='mnist.pkl.gz',
                    nkerns=[20, 50], batch_size=500):
    """ Train and evaluate a conv/pool network with dropout using
    momentum SGD and early stopping.

    This block is Python 2 code (print statements, xrange).

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient). NOTE: this argument is overwritten
                          with 0.01 before the updates are built, so the
                          value passed in has no effect.

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training /testing (MNIST here)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer

    :type batch_size: int
    :param batch_size: number of examples per minibatch
    """

    rng = numpy.random.RandomState(23455)

    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]
    # The counts are later passed to xrange, so these divisions rely on
    # `/` between integers being integer division.
    n_train_batches /= batch_size
    n_valid_batches /= batch_size
    n_test_batches /= batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch

    # start-snippet-1
    x = T.matrix('x')   # the data is presented as rasterized images
    y = T.ivector('y')  # the labels are presented as 1D vector of
                        # [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # `layers` only collects the layer objects; nothing below reads it.
    layers = []
    srng = RandomStreams(25252)
    # int8 scalar handed to the dropout layer; it is fixed to 0 in the
    # test/validation functions and to 1 in the training function.
    train_flag = T.bscalar('train_flag')

    # Reshape matrix of rasterized images of shape (batch_size, 28 * 28)
    # to a 4D tensor, compatible with the convolutional layer
    # (28, 28) is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # Construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1 , 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = ConvLayer(rng, data=layer0_input,
                       image_shape=(batch_size, 1, 28, 28),
                       filter_shape=(nkerns[0], 1, 5, 5))
    layers.append(layer0)

    layer1 = PoolLayer(data=layer0.output)
    layers.append(layer1)

    # Construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer2 = ConvLayer(rng, data=layer1.output,
                       image_shape=(batch_size, nkerns[0], 12, 12),
                       filter_shape=(nkerns[1], nkerns[0], 5, 5))
    layers.append(layer2)

    layer3 = PoolLayer(data=layer2.output)
    layers.append(layer3)

    # the fully-connected layers operate on 2D matrices of
    # shape (batch_size, num_pixels) (i.e matrix of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layerd_input = layer3.output.flatten(2)

    # Dropout with p=.5 between the pooled features and the
    # fully-connected layer.
    layerd = DropoutLayer(data=layerd_input, n_in=nkerns[1] * 4 * 4,
                          srng=srng, p=.5, train_flag=train_flag)

    # construct a fully-connected layer (activation: relu)
    layer4 = FCLayer(rng, data=layerd.output, n_in=nkerns[1] * 4 * 4,
                     n_out=500, activation=relu)
    layers.append(layer4)

    # classify the values of the fully-connected layer
    layer5 = LogisticRegression(input=layer4.output, n_in=500, n_out=10)
    layers.append(layer5)

    # the cost we minimize during training is the NLL of the model
    cost = layer5.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        layer5.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            train_flag: numpy.cast['int8'](0)
        })

    validate_model = theano.function(
        [index],
        layer5.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            train_flag: numpy.cast['int8'](0)
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer5.params + layerd.params + layer4.params + layer3.params + \
        layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD with momentum and weight decay. Since this model has many
    # parameters, it would be tedious to manually create an update rule
    # for each model parameter. We thus create the updates list by
    # automatically looping over all (params[i], grads[i]) pairs.

    # One zero-initialised shared buffer per parameter holds the previous
    # step (the momentum term).
    delta_before = []
    for param_i in params:
        delta_before_i = theano.shared(value=numpy.zeros(
            param_i.get_value().shape, dtype=theano.config.floatX),
            borrow=True)
        delta_before.append(delta_before_i)

    # NOTE: this overwrites the `learning_rate` argument.
    learning_rate = 0.01
    momentum = 0.9
    weight_decay = 0.0005

    updates = []
    for param_i, grad_i, delta_before_i in zip(params, grads, delta_before):
        # New step = momentum * previous step - weight decay - gradient step.
        delta_i = momentum * delta_before_i - weight_decay * learning_rate * param_i - learning_rate * grad_i
        updates.append((delta_before_i, delta_i))
        updates.append((param_i, param_i + delta_i))

    train_model = theano.function(
        [index],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            train_flag: numpy.cast['int8'](1)
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    # `gpu_usage` and the nvml* functions are not defined in this
    # function; presumably they come from module scope - TODO confirm.
    if gpu_usage is True:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(0)
        info = nvmlDeviceGetMemoryInfo(handle)
        print "Total memory:", info.total
        print "Free memory:", info.free

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            # global iteration number, counted from zero
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 100 == 0:
                print 'training @ iter = ', iter
            # the returned cost is not used further
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:

                # compute zero-one loss on validation set
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in xrange(n_test_batches)
                    ]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # stop once the iteration count reaches the current patience
            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print(
        'Best validation score of %f %% obtained at iteration %i, '
        'with test performance %f %%' %
        (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
def __init__(self, rng, n_in, n_out, n_h, dropout=0, sigma_g=sigmoid,
             sigma_c=hyperbolic_tangent, sigma_h=hyperbolic_tangent,
             sigma_y=softmax, dropout_rate=0, obj='c'):
    '''
    Build the symbolic graph of a single-layer LSTM that reads one flattened
    sequence and predicts from the output of the last time step.

    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param dropout: not read anywhere in this constructor; kept so existing
        callers keep working
    :param sigma_g, sigma_c, sigma_h, sigma_y: activation functions
    :param dropout_rate: dropout rate (float); values > 0 build separate
        training and testing graphs
    :param obj: objective type, 'c' for classification with cross entropy
        loss, 'r' for regression with MSE loss. (['c','r'])
    '''
    floatX = theano.config.floatX

    def uniform_init(fan_a, fan_b, shape):
        # Uniform draw in +/- sqrt(6 / (fan_a + fan_b)).
        bound = np.sqrt(6. / (fan_a + fan_b))
        return rng.uniform(-bound, bound, shape)

    def as_shared(name, value):
        return theano.shared(name=name, value=value.astype(floatX))

    # One (W, U, b) triple per gate.  The draw order (W then U, gates in
    # f, i, o, c order) is kept so a given seed yields the same weights.
    gates = {}
    for gate in ('f', 'i', 'o', 'c'):
        W_ = uniform_init(n_in, n_h, (n_h, n_in))
        U_ = uniform_init(n_h, n_h, (n_h, n_h))
        b_ = np.zeros(n_h)
        gates[gate] = (as_shared('W' + gate, W_),
                       as_shared('U' + gate, U_),
                       as_shared('b' + gate, b_))
    Wf, Uf, bf = gates['f']
    Wi, Ui, bi = gates['i']
    Wo, Uo, bo = gates['o']
    Wc, Uc, bc = gates['c']

    # Output projection, then the learned initial hidden and cell states.
    Wy = as_shared('Wy', uniform_init(n_out, n_h, (n_out, n_h)))
    by = as_shared('by', np.zeros(n_out))
    state_bound = np.sqrt(3. / (2. * n_h))
    h0 = as_shared('h0', rng.uniform(-state_bound, state_bound, n_h))
    c0 = as_shared('c0', rng.uniform(-state_bound, state_bound, n_h))

    # NOTE: c0 is listed before h0; the order is kept because code outside
    # this constructor may rely on the position of each parameter.
    self.p = [
        Wf, Uf, bf, Wi, Ui, bi, Wo, Uo, bo, Wc, Uc, bc, Wy, by, c0, h0
    ]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    # The input is a flat vector that is viewed as (seq_len, n_in).
    self.x = T.vector()
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    def lstm_cell(x_t, h_t_prev, c_t_prev):
        # Shared gate equations; returns the new hidden and cell state.
        f_t = sigma_g(Wf.dot(x_t) + Uf.dot(h_t_prev) + bf)
        i_t = sigma_g(Wi.dot(x_t) + Ui.dot(h_t_prev) + bi)
        o_t = sigma_g(Wo.dot(x_t) + Uo.dot(h_t_prev) + bo)
        c_t = i_t * sigma_c(Wc.dot(x_t) + Uc.dot(h_t_prev) + bc)
        c_t = c_t + c_t_prev * f_t
        h_t = o_t * sigma_h(c_t)
        return h_t, c_t

    def run_scan(step):
        # Unroll `step` over the sequence and return the per-step outputs.
        [outputs, _, _], _ = theano.scan(step,
                                         sequences=[x_scan],
                                         outputs_info=[None, h0, c0],
                                         n_steps=seq_len)
        return outputs

    if dropout_rate > 0:
        # NOTE: this reseeds NumPy's global generator as a side effect.
        np.random.seed(int(time.time()))

        # for training
        def masked_forward_prop_step(x_t, h_t_prev, c_t_prev):
            h_t, c_t = lstm_cell(x_t, h_t_prev, c_t_prev)
            y_t = Wy.dot(h_t) + by
            # NOTE(review): the mask is drawn with NumPy while the graph
            # is being built, so it looks like it becomes a constant
            # shared by every time step and every training call rather
            # than being resampled.  Resampling needs a Theano random
            # stream plus the scan updates, which the code compiling the
            # training function would have to pass on - TODO confirm.
            mask = np.random.binomial(np.ones(n_h, dtype=int),
                                      1.0 - dropout_rate)
            masked_h_t = h_t * T.cast(mask, floatX)
            return [y_t, masked_h_t, c_t]

        # for testing
        def forward_prop_step(x_t, h_t_prev, c_t_prev):
            h_t, c_t = lstm_cell(x_t, h_t_prev, c_t_prev)
            # Scale by the keep probability instead of masking.
            h_t = (1.0 - dropout_rate) * h_t
            y_t = Wy.dot(h_t) + by
            return [y_t, h_t, c_t]

        o_train = run_scan(masked_forward_prop_step)
        o_test = run_scan(forward_prop_step)
    else:
        def forward_prop_step(x_t, h_t_prev, c_t_prev):
            h_t, c_t = lstm_cell(x_t, h_t_prev, c_t_prev)
            y_t = Wy.dot(h_t) + by
            return [y_t, h_t, c_t]

        o_train = run_scan(forward_prop_step)
        o_test = o_train

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o_train = sigma_y(o_train[-1])
        self.o_test = sigma_y(o_test[-1])
        # objective used for the gradient: built on the training graph
        self.cost = T.nnet.categorical_crossentropy(
            self.o_train,
            T.eye(n_out)[self.y])
        # accuracy and prediction are built on the testing graph
        self.accuracy = T.switch(T.eq(T.argmax(self.o_test), self.y), 1.,
                                 0.)
        # T.argmax keeps the prediction symbolic; np.argmax cannot
        # evaluate a Theano variable.
        self.prediction = T.argmax(self.o_test)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o_train = o_train[-1]
        self.o_test = o_test[-1]
        # objective used for the gradient: built on the training graph
        self.cost = (self.o_train[0] - self.y)**2
        # for regression "accuracy" holds the squared error on the
        # testing graph
        self.accuracy = (self.o_test[0] - self.y)**2
        self.prediction = self.o_test[0]

    self.optimiser = sgd_optimizer(self, 'LSTM')
parse_mode=opts.parse_mode) log("train_stats %s %s" % (len(train_x), train_stats)) dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab, update_vocab=False, max_egs=int(opts.num_from_dev), parse_mode=opts.parse_mode) log("dev_stats %s %s" % (len(dev_x), dev_stats)) # input/output example vars s1_idxs = T.ivector('s1') # sequence for sentence one s2_idxs = T.ivector('s2') # sequence for sentence two actual_y = T.ivector('y') # single for sentence pair label; 0, 1 or 2 # dropout keep prob for post concat, pre MLP apply_dropout = T.bscalar( 'apply_dropout') # dropout.{APPLY_DROPOUT|NO_DROPOUT} keep_prob = theano.shared(opts.keep_prob) # recall 1.0 => noop keep_prob = T.cast(keep_prob, 'float32') # shared weirdity, how to set in init (?) # keep track of different "layers" that handle their own gradients. # includes rnns, final concat & softmax and, potentially, special handling for # tied embeddings layers = [] # decide set of sequence idxs we'll be processing. there will always the two # for the forward passes over s1 and s2 and, optionally, two more for the # reverse pass over s1 & s2 in the bidirectional case. idxs = [s1_idxs, s2_idxs] names = ["s1f", "s2f"] if opts.bidirectional:
non_sequences=[n_words, word_dim], sequences=[index1, index2], outputs_info=[result_mat], ) return (output[-1]) def sents_ind_2vec(self, sents): # Create Input moddule contain positional encoding scheme # the input sents presents the index of words # this will convert each fact into a vector as output shape_input = sents.shape bach_size, n_sents, n_words = shape_input positional_encode_matrix = self.positional_encoding_scheme(n_words) p_e_m_shuffle = positional_encode_matrix.dimshuffle("x", "x", 0, 1) sents_emb = self.words_ind_2vec(sents) * p_e_m_shuffle return (sents_emb.sum(axis=2)) # Debug ONLY if __name__ == "__main__": rng = np.random.RandomState(220495) arrSents = T.itensor3() nn = T.bscalar() EMBD = EncodingLayer(32, 10, rng=rng) Word2Vec = theano.function(inputs=[arrSents], outputs=EMBD.sents_ind_2vec(arrSents)) sents = [[[3, 14, 0], [0, 0, 0]], [[3, 14, 0], [1, 2, 6]]] Vec = Word2Vec(sents) print("Val: ", Vec) print("Dim: ", Vec.shape)
def test_mlp(learning_rate=0.01, L1_reg=0.000, L2_reg=0.0002, n_epochs=10000,
             dataset='data_nn.csv', batch_size=15, n_hidden1=200,
             n_hidden2=100):
    """
    Train a two-hidden-layer perceptron with minibatch stochastic gradient
    descent and early stopping, then print the gradient of the cost with
    respect to the input for one hard-coded sample.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
        gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path of the data file handed to ``load_data``

    :type batch_size: int
    :param batch_size: number of samples per minibatch

    :type n_hidden1: int
    :param n_hidden1: size passed to the MLP as ``n_hidden1``

    :type n_hidden2: int
    :param n_hidden2: size passed to the MLP as ``n_hidden2``
    """
    datasets, grad_test = load_data(dataset, 'theano_labels.csv')

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')    # one sample per row
    y = T.ivector('y')   # the labels are presented as 1D vector of
                         # [int] labels
    z = T.bscalar()      # train/test switch handed to the MLP as `ts`

    # Values bound to `z` when compiling.  Assigning `classifier.ts` after
    # the graph is built cannot change a compiled function, so the switch is
    # fixed per function through `givens` instead.
    train_mode = numpy.int8(1)
    eval_mode = numpy.int8(0)

    rng = numpy.random.RandomState(1234)

    # construct the MLP class
    classifier = MLP(
        rng=rng,
        input=x,
        n_in=96,
        n_hidden1=n_hidden1,
        n_hidden2=n_hidden2,
        n_out=4,
        ts=z
    )

    # the cost we minimize during training is the cross entropy of the model
    # plus the regularization terms (L1 and L2); cost is expressed here
    # symbolically
    cost = (
        classifier.crossentropy(y)
        + L1_reg * classifier.L1
        + L2_reg * classifier.L2_sqr
    )

    # compiling Theano functions that compute the mistakes that are made by
    # the model on a minibatch
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            z: eval_mode,
        }
    )

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            z: eval_mode,
        }
    )

    # compute the gradient of cost with respect to every parameter
    gparams = [T.grad(cost, param) for param in classifier.params]
    # gradient of the cost with respect to the network input, used by the
    # probe at the end of this function
    ginput = theano.gradient.jacobian(cost, classifier.input)

    # plain SGD: one (variable, update expression) pair per parameter
    updates = [
        (param, param - learning_rate * gparam)
        for param, gparam in zip(classifier.params, gparams)
    ]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size: (index + 1) * batch_size],
            y: train_set_y[index * batch_size: (index + 1) * batch_size],
            z: train_mode,
        }
    )

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    patience = 10000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i
                                     in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print(
                    'epoch %i, minibatch %i/%i, validation error %f %%' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (
                        this_validation_loss < best_validation_loss *
                        improvement_threshold
                    ):
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i
                                   in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), file=sys.stderr)

    # Probe: gradient of the cost with respect to the input for one sample.
    f = theano.function([classifier.input, y], ginput, givens={z: eval_mode})
    print(pp(f.maker.fgraph.outputs[0]))
    theano.printing.pydotprint(f.maker.fgraph.outputs[0])
    # A compiled function takes numeric arrays, one positional argument per
    # declared input: a (1, 96) matrix for `x` and a length-1 int vector
    # for `y`.
    test_in = numpy.asarray([[
        -0.85932366, 0.58308557, 0.57291266, -2.73567018,
        0.5720682, 0.69788969, 0.01782218, 0.89483408,
        3.28290884, -0.38769847, -0.76087236, -0.50285087,
        -1.24656495, -0.73529842, -0.99193669, -1.95702179,
        1.57430759, 0.24463588, 3.06210202, 2.45264677,
        -0.25134517, -0.04829522, -0.55535032, 0.0503641,
        -1.91432708, 0.77470853, 0.7401515, -2.71915318,
        0.4963475, 1.00522374, 0.27958163, -1.35574041,
        0.58434732, -0.67177877, -1.07827181, -2.11205369,
        -1.48408336, -0.80823029, -0.95141343, -1.98320406,
        1.46513513, -0.09303374, -0.76959049, 0.85930122,
        -0.86362607, -0.78979288, -0.0444948, 0.45982332,
        -2.31018994, 0.85091726, 0.77935362, -2.70620804,
        0.44422539, 1.24119296, 0.09150836, -2.86868139,
        -1.16801813, -0.50896104, -0.05604379, -0.57235696,
        -1.08455522, -1.17935154, -1.12812324, -1.9744183,
        0.19983282, -0.11654747, -1.15473115, -0.07447867,
        -0.28972877, -0.94642741, 0.26084976, 0.46156281,
        2.1851348, -0.77191925, -0.73766559, 1.8115434,
        0.83390925, -0.91492798, 0.06507779, 2.07655773,
        2.62112977, 0.04236459, -0.34407471, -0.03113814,
        0.67895545, 1.1023399, 0.77840311, 1.18688628,
        1.31362216, 0.86287225, 2.23127128, 1.32033075,
        0.07084121, 0.45882767, -0.52361762, -0.24316931,
    ]], dtype=theano.config.floatX)
    # NOTE(review): the label value 4 is kept from the original probe; with
    # n_out=4 a zero-based label would be out of range - confirm how
    # `crossentropy` interprets labels.
    test_out = numpy.asarray([4], dtype='int32')
    print(f(test_in, test_out))
def sgd_optimization_mnist(learning_rate=0.2, n_epochs=1000, batch_size=5):
    """Train a logistic-regression classifier with minibatch SGD.

    Early stopping is driven by the validation error; whenever a new best
    validation error is found the test error is measured and reported.

    :param learning_rate: step size of the gradient updates
    :param n_epochs: maximal number of passes over the training set
    :param batch_size: number of samples per minibatch
    """
    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # Floor division keeps the batch counts integral on any interpreter.
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    # The minibatch index must be a wide integer: an int8 scalar cannot
    # represent batch indices above 127, and `index * batch_size` would be
    # computed in int8 as well.
    index = T.lscalar()
    x = T.matrix('x')
    y = T.ivector('y')

    classifier = LogisticRegression(input=x, n_in=img_size, n_out=9)

    cost = classifier.negative_log_likelihood(y)

    # Error rate on one minibatch of the test / validation set.
    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size: (index + 1) * batch_size],
            y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size]})

    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    updates = [(classifier.W, classifier.W - learning_rate * g_W),
               (classifier.b, classifier.b - learning_rate * g_b)]

    # One SGD step on a minibatch; returns the cost before the update.
    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]})

    ###############
    # TRAIN MODEL #
    ###############
    print('... training the model')
    patience = 5000                # minimum number of iterations to run
    patience_increase = 2          # patience multiplier on a new best
    improvement_threshold = 0.995  # relative improvement considered
                                   # significant
    # validate once per epoch, or sooner if patience is very small
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):

            train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i)
                                     for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    # extend patience only on a significant improvement
                    if this_validation_loss < best_validation_loss * \
                            improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss

                    test_losses = [test_model(i)
                                   for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)

                    print(('     epoch %i, minibatch %i/%i, test error of best'
                           ' model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f %%,'
           'with test performance %f %%') %
          (best_validation_loss * 100., test_score * 100.))
    print('The code run for %d epochs, with %f epochs/sec' % (
        epoch, 1. * epoch / (end_time - start_time)))
    sys.stderr.write('The code for file ' +
                     os.path.split(__file__)[1] +
                     ' ran for %.1fs' % ((end_time - start_time)) + '\n')
import theano
import theano.tensor as tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams

# Module-level state used to toggle training and to share one random stream.
layer_train_rng = MRG_RandomStreams()
layer_train_enable = tensor.bscalar()  # symbolic int8 scalar
layer_train_epoch = tensor.iscalar()   # symbolic int32 scalar
layer_train_it = theano.shared(0)


def get_rng():
    """Return the module-wide MRG random stream."""
    return layer_train_rng


def set_rng_seed(v):
    """Seed the module-wide MRG random stream with `v`."""
    layer_train_rng.seed(v)


def get_train():
    """Return the module-wide symbolic training-enable scalar."""
    return layer_train_enable


def get_epoch():
    """Return the module-wide symbolic epoch scalar."""
    return layer_train_epoch
def __init__(self,
             dataset,
             learning_rate=0.001,
             decrease_constant=0,
             hidden_sizes=[500],
             random_seed=1234,
             batch_size=1,
             hidden_activation=T.nnet.sigmoid,
             use_cond_mask=False,
             direct_input_connect="None",
             direct_output_connect=False,
             update_rule="None",
             dropout_rate=0,
             weights_initialization="Uniform",
             mask_distribution=0):
    """Build the masked network graph and compile its Theano functions.

    `dataset` is indexed as ``dataset['input_size']`` and
    ``dataset[split]['data']`` for the 'train', 'valid' and 'test' splits;
    the same data is used as input and as reconstruction target.
    `update_rule` selects the optimiser ("None", "adadelta", "adagrad",
    "rmsprop", "adam" or "adam_paper"), and `weights_initialization` names
    the method looked up on ``WeightsInitializer``.
    """
    input_size = dataset['input_size']

    self.shuffled_once = False

    class SeedGenerator(object):
        # Derives every seed from one RandomState so that runs stay
        # reproducible while each consumer still gets its own seed.
        def __init__(self, random_seed):
            self.rng = np.random.mtrand.RandomState(random_seed)

        def get(self):
            return self.rng.randint(42424242)

    self.seed_generator = SeedGenerator(random_seed)

    # Seeds are consumed in a fixed order: random stream, weights
    # initializer, then mask generator.
    self.trng = T.shared_randomstreams.RandomStreams(
        self.seed_generator.get())
    initializer = WeightsInitializer(self.seed_generator.get())
    # Resolve the initialization routine from its string name.
    weights_initialization = getattr(initializer, weights_initialization)

    # Symbolic variables of the model's graph
    x_in = T.matrix(name="input")
    x_target = T.matrix(name="target")
    is_train = T.bscalar(name="is_train")

    self.mask_generator = MaskGenerator(input_size, hidden_sizes,
                                        mask_distribution,
                                        self.seed_generator.get())

    # First layer, fed directly by the input
    first_layer = ConditionningMaskedLayer(
        layerIdx=0,
        input=x_in,
        n_in=input_size,
        n_out=hidden_sizes[0],
        activation=hidden_activation,
        weights_initialization=weights_initialization,
        mask_generator=self.mask_generator,
        use_cond_mask=use_cond_mask)
    self.layers = [
        dropoutLayerDecorator(first_layer, self.trng, is_train,
                              dropout_rate)
    ]

    # Remaining hidden layers, each fed by the layer built just before it
    for i in range(1, len(hidden_sizes)):
        previous_layer = self.layers[-1]
        skip_input = None
        if (direct_input_connect == "Full"
                and previous_layer.output != x_in):
            skip_input = x_in
        hidden_layer = DirectInputConnectConditionningMaskedLayer(
            layerIdx=i,
            input=previous_layer.output,
            n_in=hidden_sizes[i - 1],
            n_out=hidden_sizes[i],
            activation=hidden_activation,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask,
            direct_input=skip_input)
        self.layers.append(
            dropoutLayerDecorator(hidden_layer, self.trng, is_train,
                                  dropout_rate))

    # Output layer
    outputLayerIdx = len(self.layers)
    previous_layer = self.layers[outputLayerIdx - 1]
    skip_input = None
    if ((direct_input_connect == "Full"
         or direct_input_connect == "Output")
            and previous_layer.output != x_in):
        skip_input = x_in
    # Collected before the output layer is appended, so the slice drops
    # the first and the last of the layers built so far.
    skip_outputs = []
    if direct_output_connect:
        for layer in self.layers[1:-1]:
            skip_outputs.append((layer.layer_idx, layer.n_in, layer.input))
    self.layers.append(
        DirectOutputInputConnectConditionningMaskedOutputLayer(
            layerIdx=outputLayerIdx,
            input=previous_layer.output,
            n_in=hidden_sizes[outputLayerIdx - 1],
            n_out=input_size,
            activation=T.nnet.sigmoid,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask,
            direct_input=skip_input,
            direct_outputs=skip_outputs))

    # The loss function: mean negative log-probability over the batch,
    # computed from the output layer's pre-activation.
    last_layer = self.layers[-1]
    output = last_layer.output
    pre_output = last_layer.lin_output
    log_prob = -T.sum(T.nnet.softplus(-x_target * pre_output +
                                      (1 - x_target) * pre_output),
                      axis=1)
    loss = (-log_prob).mean()

    # Parameters and their gradients
    self.parameters = []
    for layer in self.layers:
        for param in layer.params:
            self.parameters.append(param)
    parameters_gradient = T.grad(loss, self.parameters)

    # Pick the update rule; construction is deferred so that only the
    # selected optimiser is instantiated.
    rule_table = (
        ("None", lambda: DecreasingLearningRate(learning_rate,
                                                decrease_constant)),
        ("adadelta", lambda: AdaDelta(decay=decrease_constant,
                                      epsilon=learning_rate)),
        ("adagrad", lambda: AdaGrad(learning_rate=learning_rate)),
        ("rmsprop", lambda: RMSProp(learning_rate=learning_rate,
                                    decay=decrease_constant)),
        ("adam", lambda: Adam(learning_rate=learning_rate)),
        ("adam_paper", lambda: Adam_paper(learning_rate=learning_rate)),
    )
    for rule_name, make_rule in rule_table:
        if update_rule == rule_name:
            self.update_rule = make_rule()
            break
    updates = self.update_rule.get_updates(
        zip(self.parameters, parameters_gradient))

    # How to shuffle the weight masks
    masks_updates = []
    for layer in self.layers:
        for layer_mask_update in layer.shuffle_update:
            masks_updates.append(layer_mask_update)
    self.update_masks = theano.function(name='update_masks',
                                        inputs=[],
                                        updates=masks_updates)

    index = T.lscalar()

    def train_batch_givens(size):
        # Input and target are both the `index`-th training slice of
        # `size` rows.
        return {
            x_in: dataset['train']['data'][index * size:(index + 1) * size],
            x_target: dataset['train']['data'][index * size:
                                               (index + 1) * size]
        }

    def whole_split_givens(split):
        # Input and target are both the full data of `split`.
        return {
            x_in: dataset[split]['data'],
            x_target: dataset[split]['data']
        }

    # Every function below ignores unused inputs because `is_train` is not
    # part of the graph when dropout is absent.

    # Functions to train and use the model
    self.learn = theano.function(
        name='learn',
        inputs=[index, is_train],
        outputs=loss,
        updates=updates,
        givens=train_batch_givens(batch_size),
        on_unused_input='ignore')

    self.use = theano.function(
        name='use',
        inputs=[x_in, is_train],
        outputs=output,
        on_unused_input='ignore')

    # Test functions
    self.valid_log_prob = theano.function(
        name='valid_log_prob',
        inputs=[is_train],
        outputs=log_prob,
        givens=whole_split_givens('valid'),
        on_unused_input='ignore')
    self.train_log_prob = theano.function(
        name='train_log_prob',
        inputs=[is_train],
        outputs=log_prob,
        givens=whole_split_givens('train'),
        on_unused_input='ignore')
    self.train_log_prob_batch = theano.function(
        name='train_log_prob_batch',
        inputs=[index, is_train],
        outputs=log_prob,
        givens=train_batch_givens(1000),
        on_unused_input='ignore')
    self.test_log_prob = theano.function(
        name='test_log_prob',
        inputs=[is_train],
        outputs=log_prob,
        givens=whole_split_givens('test'),
        on_unused_input='ignore')

    # Functions for verifying the gradient
    self.useloss = theano.function(
        name='useloss',
        inputs=[x_in, x_target, is_train],
        outputs=loss,
        on_unused_input='ignore')
    self.learngrad = theano.function(
        name='learn',
        inputs=[index, is_train],
        outputs=parameters_gradient,
        givens=train_batch_givens(batch_size),
        on_unused_input='ignore')
def test_mlp(
    learning_rate=0.01,
    L1_reg=0.00,
    L2_reg=0.0001,
    cor_reg=0.00,
    cor_scaling=1.0,
    rand_seed=1234,
    dropout=False,
    n_epochs=1000,
    dataset="mnist.pkl.gz",
    batch_size=20,
    n_hidden=500,
    save_correlations=False,
):
    """
    Demonstrate stochastic gradient descent optimization for a multilayer
    perceptron

    This is demonstrated on MNIST.  The validation error of every epoch is
    written to a CSV file under ``../output/MLP`` relative to this file.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
        gradient)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)

    :type cor_reg: float
    :param cor_reg: weight of ``classifier.cor_sqr_sum`` in the cost; the
        term is left out of the cost entirely when this is 0

    :type cor_scaling: float
    :param cor_scaling: factor the correlation weight is multiplied by after
        every epoch (no update is run when it equals 1)

    :type rand_seed: int
    :param rand_seed: seed of the NumPy RandomState used for the model and
        for the per-epoch shuffling of the training samples

    :param dropout: forwarded to the MLP constructor

    :type n_epochs: int
    :param n_epochs: number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
        http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of samples per minibatch

    :type n_hidden: int
    :param n_hidden: forwarded to the MLP constructor as ``n_hidden``

    :type save_correlations: bool
    :param save_correlations: also record the mean activation correlations
        of every epoch in a second CSV file
    """
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]

    # compute number of minibatches for training and validation; floor
    # division keeps the counts integral on any interpreter
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] // batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print("... building the model")

    # allocate symbolic variables for the data
    cor_reg_var = theano.shared(cor_reg)  # symbolic variable storing cor_reg value
    alpha = T.dscalar("alpha")  # scaling factor for weight decay
    is_train = T.bscalar("is_train")  # boolean for switching between training and prediction
    index = T.lscalar()  # index to a [mini]batch
    perm = T.lvector()  # permutation of the indices of the training samples
    x = T.matrix("x")  # the data is presented as rasterized images
    y = T.ivector("y")  # the labels are presented as 1D vector of
    # [int] labels

    rng = np.random.RandomState(rand_seed)

    # construct the MLP class
    classifier = MLP(rng=rng, input=x, n_in=28 * 28, n_hidden=n_hidden, n_out=10, dropout=dropout)

    # the cost we minimize during training is the negative log likelihood of
    # the model plus the regularization terms (L1 and L2); cost is expressed
    # here symbolically
    if cor_reg == 0:
        cost = classifier.negative_log_likelihood(y) + L1_reg * classifier.L1 + L2_reg * classifier.L2_sqr
    else:
        cost = (
            classifier.negative_log_likelihood(y)
            + L1_reg * classifier.L1
            + L2_reg * classifier.L2_sqr
            + cor_reg_var * classifier.cor_sqr_sum
        )

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    if save_correlations:
        validation_outputs = [classifier.errors(y), classifier.activation_correlation]
    else:
        validation_outputs = classifier.errors(y)
    validate_model = theano.function(
        inputs=[index],
        outputs=validation_outputs,
        givens={
            x: valid_set_x[index * batch_size : (index + 1) * batch_size],
            y: valid_set_y[index * batch_size : (index + 1) * batch_size],
            is_train: np.cast["int8"](0),
        },
    )

    # compute the gradient of cost with respect to every parameter
    gparams = [T.grad(cost, param) for param in classifier.params]

    # plain SGD: one (variable, update expression) pair per parameter
    updates = [(param, param - learning_rate * gparam) for param, gparam in zip(classifier.params, gparams)]

    # compiling a Theano function `train_model` that returns the cost, but
    # in the same time updates the parameter of the model based on the rules
    # defined in `updates`
    train_model = theano.function(
        inputs=[index, perm],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[perm[index * batch_size : (index + 1) * batch_size]],
            y: train_set_y[perm[index * batch_size : (index + 1) * batch_size]],
            is_train: np.cast["int8"](1),
        },
    )

    # update the symbolic cor_reg variable
    update_cor_reg = theano.function(inputs=[alpha], outputs=cor_reg_var, updates=[(cor_reg_var, cor_reg_var * alpha)])

    ###############
    # TRAIN MODEL #
    ###############
    print("... training")

    best_validation_loss = np.inf
    best_epoch = 0
    start_time = timeit.default_timer()

    output_dir = os.path.join(os.path.split(__file__)[0], "..", "output", "MLP")
    file_tag = (
        n_epochs,
        n_epochs * n_train_batches,
        cor_reg,
        dropout,
        cor_scaling,
    )

    # Open file for writing validation losses
    valid_loss_filename = "ValidationLoss_Epoch%i_Batch%i_Cor%f_Drop%i_Scale%f.csv" % file_tag
    valid_loss_outfile = open(os.path.join(output_dir, valid_loss_filename), "w")
    flat_corr_outfile = None
    # try/finally guarantees both files are closed even when training raises
    # or is interrupted.
    try:
        valid_loss_outfile.write("Epoch,Iteration,Error\n")

        if save_correlations:
            flat_corr_filename = "FlatCorrelations_Epoch%i_Batch%i_Cor%f_Drop%i_Scale%f.csv" % file_tag
            flat_corr_outfile = open(os.path.join(output_dir, flat_corr_filename), "w")

        epoch = 0
        while epoch < n_epochs:
            epoch += 1
            # generate new permutation of indices
            index_perm = rng.permutation(train_set_x.get_value(borrow=True).shape[0])

            # perform 1 epoch of training
            for minibatch_index in xrange(n_train_batches):
                train_model(minibatch_index, index_perm)

            # NOTE(review): `get_value` exists on shared variables only;
            # this print fails unless `hiddenLayer.output` is one - confirm.
            print("Hidden layer after training:\n")
            print(classifier.hiddenLayer.output.get_value())

            # compute zero-one loss on validation set
            if save_correlations:
                # compute and save the average pairwise correlations
                validation_losses = []
                mean_correlations = 0  # contains mean correlation matrix once loop is finished
                for i in xrange(n_valid_batches):
                    valid_loss, valid_corr = validate_model(i)
                    validation_losses.append(valid_loss)
                    # iteratively constructs mean to save memory
                    mean_correlations += 1.0 * valid_corr / n_valid_batches
                this_validation_loss = np.mean(validation_losses)
                flat_mean_correlation = flatten_correlation_matrix(mean_correlations)
                flat_corr_outfile.write(str(epoch) + "," + ",".join(map(str, flat_mean_correlation)) + "\n")
            else:
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)

            # Write this epoch's validation error to the file
            valid_loss_outfile.write(("%i,%i,%f\n") % (epoch, epoch * n_train_batches, this_validation_loss))

            # ********COMMENT THIS OUT WHEN RUNNING MULTIPLE PARAMS OVERNIGHT********
            print(
                "epoch %i (iteration %i), validation error %f %%, cor_reg %f"
                % (epoch, epoch * n_train_batches, this_validation_loss * 100.0, cor_reg_var.get_value())
            )

            print("Hidden layer after validation:\n")
            print(classifier.hiddenLayer.output.get_value())

            # if we got the best validation score until now
            if this_validation_loss < best_validation_loss:
                best_validation_loss = this_validation_loss
                best_epoch = epoch

            # Update the value of cor_reg for the next epoch
            # Only makes a difference when cor_scaling != 1, because multiplication
            if cor_scaling != 1:
                update_cor_reg(cor_scaling)
    finally:
        valid_loss_outfile.close()
        if flat_corr_outfile is not None:
            flat_corr_outfile.close()

    end_time = timeit.default_timer()
    print(
        ("Optimization complete. Best validation score of %f %% " "obtained following epoch %i (iteration %i)")
        % (best_validation_loss * 100.0, best_epoch, best_epoch * n_train_batches)
    )
    print("Training process ran for %.2fm" % ((end_time - start_time) / 60.0))
def __init__(self):
    """Run the parent initialiser, then create the symbolic ``is_train`` flag."""
    super(DropoutModel, self).__init__()
    # Symbolic int8 scalar named 'is_train'.
    # NOTE(review): presumably fed at call time to toggle dropout between
    # training and inference -- confirm against the code that consumes it.
    self.is_train = T.bscalar(name='is_train')
def optimiser(self, num_samples, update, update_kwargs, saved_update=None):
    """Compile a Theano function that performs one optimisation step.

    The objective is an importance-weighted ELBO in which one latent
    dimension (selected at call time by ``latent_dim``) is replaced by a
    sample from the "overdispersed" proposal of the recognition model.  The
    last recognition parameter (``tau``) is not updated with the ELBO
    gradient; instead it receives the gradient of the summed variance of the
    per-sample gradients.

    :param num_samples: number of times each row of the batch is repeated
    :param update: callable returning an updates mapping; it is called with
        the entries of ``update_kwargs`` plus ``loss_or_grads`` and ``params``
    :param update_kwargs: extra keyword arguments for ``update``; this dict is
        NOT modified (a copy is extended with the two keys above)
    :param saved_update: optional mapping whose keys are shared variables;
        their values are copied, in order, into the newly created update
        variables
    :return: ``(optimiser, updates)`` where ``optimiser(batch, latent_dim)``
        returns the ELBO and applies ``updates``
    """
    latent_dim = T.bscalar('latent_dim')
    batch = T.matrix('batch')
    batch_rep = T.repeat(batch, num_samples, axis=0)

    recognition = self.recognition_model
    generative = self.generative_model

    # Regular samples for every latent, then overwrite the selected column
    # with the overdispersed sample.
    h_regular_rep = recognition.get_samples_latents_regular(batch_rep)
    h_overdisp_rep = recognition.get_samples_latents_overdisp(
        batch_rep, latent_dim)
    h_rep = T.set_subtensor(h_regular_rep[:, latent_dim], h_overdisp_rep)

    log_p_h = generative.log_p_h(h_rep)
    log_p_x = generative.log_p_x(h_rep, batch_rep)
    entropies_h = recognition.entropies_latents(h_rep, batch_rep)
    imp_wts = recognition.importance_weights_latents(
        h_overdisp_rep, batch_rep, latent_dim)

    elbos_rep = imp_wts * (log_p_h + log_p_x + entropies_h)
    # One row per data point, one column per repeated sample.
    elbos_matrix = elbos_rep.reshape((batch.shape[0], num_samples))
    elbo = T.sum(T.mean(elbos_matrix, axis=1, keepdims=True))

    generative_params = generative.get_params()
    recognition_params = recognition.get_params()
    tau = recognition_params[-1]
    # Every parameter except tau follows the (negative) ELBO gradient.
    elbo_params = generative_params + recognition_params[:-1]
    grads = T.grad(-elbo, elbo_params)

    # Gradient of each sample column separately, to estimate the variance
    # of the gradient estimator.
    all_grads, _ = theano.scan(
        lambda s, E: T.grad(-T.sum(E[s]), elbo_params),
        sequences=[T.arange(elbos_matrix.T.shape[0])],
        non_sequences=[elbos_matrix.T],
    )
    variance = T.sum([T.sum(T.var(g, axis=0)) for g in all_grads])
    grads = grads + [T.grad(variance, tau)]

    # tau is last in recognition_params, matching the gradient appended above.
    all_params = generative_params + recognition_params

    # Work on a copy so the caller's dict is left untouched.
    call_kwargs = dict(update_kwargs, loss_or_grads=grads, params=all_params)
    updates = update(**call_kwargs)

    if saved_update is not None:
        # Restore optimiser state from a previous updates mapping.
        for new_var, old_var in zip(updates, saved_update.keys()):
            new_var.set_value(old_var.get_value())

    optimiser = theano.function(
        inputs=[batch, latent_dim],
        outputs=elbo,
        updates=updates,
    )
    return optimiser, updates
def __init__(self, rng, n_in, n_out, n_h, n_layers, f_act=leaky_relu, obj='single', dropout_rate = 0):
    '''
    Build the symbolic graph of a feed-forward network made of one
    HiddenLayer, ``n_layers - 1`` ResNetLayers and one output HiddenLayer.

    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param n_layers: Number of hidden layers (int, must be >= 1)
    :param f_act: Hidden-to-hidden activation function
    :param obj: 'single' (softmax output, scalar label) or
                'multi' (sigmoid output, label vector)
    :param dropout_rate: dropout rate given to every hidden layer; the
                         output layer is always built with rate 0
    :raises ValueError: if ``obj`` is neither 'single' nor 'multi'
    '''
    if obj == 'single':
        f_out = softmax
    elif obj == 'multi':
        f_out = sigmoid
    else:
        # Previously f_out stayed unbound and the failure surfaced later.
        raise ValueError("obj must be 'single' or 'multi', got %r" % (obj,))

    self.x = T.vector()

    # construct hidden layers
    assert(n_layers >= 1)
    first_hiddenLayer = HiddenLayer(
        rng=rng,
        input=self.x,
        predict_input=self.x,
        n_in=n_in,
        n_out=n_h,
        activation=f_act,
        dropout_rate=dropout_rate,
        nametag='0'
    )
    self.hidden_layers = [first_hiddenLayer]
    self.p = first_hiddenLayer.params[:]

    for i in range(n_layers - 1):
        previous = self.hidden_layers[-1]
        cur_hiddenLayer = ResNetLayer(
            rng=rng,
            input=previous.output,
            predict_input=previous.predict_output,
            n_h=n_h,
            activation=f_act,
            dropout_rate=dropout_rate,
            nametag=str(i + 1)
        )
        self.hidden_layers.append(cur_hiddenLayer)
        self.p.extend(cur_hiddenLayer.params[:])

    # params for output layer
    last_hidden = self.hidden_layers[-1]
    self.outputLayer = HiddenLayer(
        rng=rng,
        input=last_hidden.output,
        predict_input=last_hidden.predict_output,
        n_in=n_h,
        n_out=n_out,
        activation=f_out,
        dropout_rate=0,
        nametag='o'
    )
    self.p.extend(self.outputLayer.params[:])

    # counts the output layer as well
    self.n_layers = n_layers + 1
    self.obj = obj

    if obj == 'single':
        self.y = T.bscalar('y')
        self.o = self.outputLayer.output
        # cross entropy against the one-hot row selected by the label
        self.cost = T.nnet.categorical_crossentropy(self.o, T.eye(n_out)[self.y])
        self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
        # self.o is symbolic, so use the Theano op rather than numpy's
        self.prediction = T.argmax(self.o)
    else:  # obj == 'multi'
        self.y = T.bvector('y')
        self.o = self.outputLayer.output
        self.cost = T.nnet.binary_crossentropy(self.o, self.y).mean()
        # ascending argsort: the highest-scoring classes are at the end
        self.prediction = T.argsort(self.o)
        self.accuracy = self.y[T.argmax(self.o)]
        # fraction of the top-3 / top-5 scored classes whose label entry is set
        self.accuracy3 = (1.0/3.0) * (self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]])
        self.accuracy5 = (1.0/5.0) * (self.y[self.prediction[-5]]+self.y[self.prediction[-4]]+self.y[self.prediction[-3]]+self.y[self.prediction[-2]]+self.y[self.prediction[-1]])

    self.optimiser = sgd_optimizer(self, 'ResNet')
def SGD(self, training_data, no_improvement_in, mini_batch_size, eta,
        validation_data, test_data, lmbda=0.0, monitor_test=False):
    """Train the network using mini-batch stochastic gradient descent.

    Training runs until the validation accuracy has not reached a new best
    for more than ``no_improvement_in`` consecutive epochs.

    :param training_data: ``(x, y)`` pair of shared variables
    :param no_improvement_in: number of epochs without a new best validation
        accuracy that is tolerated before stopping
    :param mini_batch_size: used to compute the number of mini-batches
        (the slices themselves use ``self.mini_batch_size``)
    :param eta: learning rate
    :param validation_data: ``(x, y)`` pair of shared variables
    :param test_data: ``(x, y)`` pair of shared variables
    :param lmbda: L2 regularisation strength
    :param monitor_test: when true, record test accuracy (overall and per
        class) each time the validation accuracy reaches a new best
    :return: ``(test_acc, per_class_acc)`` when ``monitor_test`` is true,
        where ``per_class_acc`` is the transposed list of per-class
        accuracies; otherwise ``None``
    """
    # NOTE(review): the nesting of the evaluation code below was
    # reconstructed from flattened text -- confirm against the original file.
    training_x, training_y = training_data
    validation_x, validation_y = validation_data
    test_x, test_y = test_data

    # compute number of minibatches for training, validation and testing
    num_training_batches = int(size(training_data) / mini_batch_size)
    num_validation_batches = int(size(validation_data) / mini_batch_size)
    num_test_batches = int(size(test_data) / mini_batch_size)

    # define the (regularized) cost function, symbolic gradients, and updates
    l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
    cost = (self.layers[-1].cost(self)
            + 0.5 * lmbda * l2_norm_squared / num_training_batches)
    grads = T.grad(cost, self.params)
    updates = [(param, param - eta * grad)
               for param, grad in zip(self.params, grads)]

    # define functions to train a mini-batch, and to compute the
    # accuracy in validation and test mini-batches.
    i = T.lscalar()  # mini-batch index
    n_class = T.bscalar()  # number of the class
    # rows of mini-batch i
    batch = slice(i * self.mini_batch_size, (i + 1) * self.mini_batch_size)
    output_layer = self.layers[-1]

    def xy_givens(data_x, data_y):
        # Substitute the network inputs with mini-batch i of the given set.
        return {self.x: data_x[batch], self.y: data_y[batch]}

    train_mb = theano.function(
        [i], cost, updates=updates,
        givens=xy_givens(training_x, training_y))
    validate_mb_accuracy = theano.function(
        [i], output_layer.accuracy(self.y),
        givens=xy_givens(validation_x, validation_y))
    test_mb_accuracy = theano.function(
        [i], output_layer.accuracy(self.y),
        givens=xy_givens(test_x, test_y))
    test_mb_accuracies_by_class = theano.function(
        [i, n_class], output_layer.accuracies_by_class(self.y, n_class),
        givens=xy_givens(test_x, test_y),
        on_unused_input='ignore')
    test_mb_per_by_class = theano.function(
        [i, n_class], output_layer.per_by_class(self.y, n_class),
        givens=xy_givens(test_x, test_y),
        on_unused_input='ignore')
    self.test_mb_predictions = theano.function(
        [i], output_layer.y_out,
        givens={self.x: test_x[batch]})

    # Do the actual training
    best_test_accuracy = 0.0
    best_validation_accuracy = 0.0
    # Pre-bound so the summary prints below cannot hit an unbound name when
    # no evaluation ever improves on the initial 0.0.
    best_iteration = None
    best_iteration_acc = None
    epoch = 0
    it = -1  # epochs since the last best validation accuracy
    # Keep trace of the test accuracy
    if monitor_test:
        test_acc = []
        test_acc_by_class = []

    while True:
        it += 1
        epoch += 1
        for minibatch_index in range(num_training_batches):
            iteration = num_training_batches * epoch + minibatch_index
            train_mb(minibatch_index)
            # only evaluate after the last mini-batch of the epoch
            if (iteration + 1) % num_training_batches != 0:
                continue
            validation_accuracy = np.mean(
                [validate_mb_accuracy(j)
                 for j in range(num_validation_batches)])
            if validation_accuracy < best_validation_accuracy:
                continue
            best_validation_accuracy = validation_accuracy
            best_iteration = iteration
            it = -1
            if not test_data:
                continue
            test_accuracy = np.mean(
                [test_mb_accuracy(j) for j in range(num_test_batches)])
            if monitor_test:
                test_acc.append(test_accuracy)
            if test_accuracy > best_test_accuracy:
                best_test_accuracy = test_accuracy
                best_iteration_acc = iteration
            if monitor_test:
                # mean per-class accuracy, normalised by the mean of
                # per_by_class over the same batches
                test_acc_by_class.append([
                    np.mean([test_mb_accuracies_by_class(j, c)
                             for j in range(num_test_batches)])
                    / np.mean([test_mb_per_by_class(j, c)
                               for j in range(num_test_batches)])
                    for c in range(output_layer.n_out)
                ])
        if it >= no_improvement_in:
            break

    print("Finished training network after {} epochs.".format(epoch))
    print("Best test accuracy of {0:.2%} obtained at iteration {1}".format(
        best_test_accuracy, best_iteration_acc))
    print("Best validation accuracy of {0:.2%} obtained at iteration {1}".
          format(best_validation_accuracy, best_iteration))
    if monitor_test:
        return test_acc, np.transpose(test_acc_by_class)
def __init__(self, rng, n_in, n_out, n_h, f_act=leaky_relu, f_out=softmax, orth_init=True, dropout_rate=0, obj='c'):
    '''
    Build the symbolic graph of a vanilla RNN that reads a flattened
    sequence and produces an output from the last time step.

    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param f_act: Hidden-to-hidden activation function
    :param f_out: Output activation function
    :param orth_init: if true, the initialize transition matrix to be orthogonal (bool)
    :param dropout_rate: dropout rate (float)
    :param obj: objective type, 'c' for classification with cross entropy loss, 'r' for regression with MSE loss. (['c','r'])
    '''
    # Initial values; the order of rng draws is kept as Whh, Whi, Woh, h0.
    if orth_init:
        Whh_ = rvs(rng, n_h)
    else:
        Whh_ = rng.uniform(-np.sqrt(6. / (n_h + n_h)), np.sqrt(6. / (n_h + n_h)), (n_h, n_h))
    Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    bh_ = np.zeros(n_h)
    Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)), np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
    bo_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: Created shared variables
    floatX = theano.config.floatX
    Whh = theano.shared(name='Whh', value=Whh_.astype(floatX))
    Whi = theano.shared(name='Whi', value=Whi_.astype(floatX))
    bh = theano.shared(name='bh', value=bh_.astype(floatX))
    Woh = theano.shared(name='Woh', value=Woh_.astype(floatX))
    bo = theano.shared(name='bo', value=bo_.astype(floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(floatX))

    self.p = [Whh, Whi, Woh, bh, bo, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    self.dropout_rate = dropout_rate
    # The input is a flat vector that is viewed as seq_len rows of n_in.
    self.x = T.vector()
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if dropout_rate > 0:
        np.random.seed(int(time.time()))

        # for training
        def masked_forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            # NOTE(review): the mask is drawn with numpy, not with a Theano
            # random stream, so it looks like it becomes a constant of the
            # graph (one fixed mask) -- confirm whether that is intended.
            mask = np.random.binomial(np.ones(n_h, dtype=int), 1 - dropout_rate)
            masked_h_t = h_t * T.cast(mask, floatX)
            return [o_t, masked_h_t]

        # for testing: scale the hidden state by the keep probability
        def forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            h_t = (1.0 - dropout_rate) * h_t
            return [o_t, h_t]

        [o_train, _], _ = theano.scan(masked_forward_prop_step,
                                      sequences=[x_scan],
                                      outputs_info=[None, h0],
                                      n_steps=seq_len)
        [o_test, _], _ = theano.scan(forward_prop_step,
                                     sequences=[x_scan],
                                     outputs_info=[None, h0],
                                     n_steps=seq_len)
    else:
        def forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + Whh.dot(h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]

        [o_train, _], _ = theano.scan(forward_prop_step,
                                      sequences=[x_scan],
                                      outputs_info=[None, h0],
                                      n_steps=seq_len)
        # without dropout the training and test graphs are the same
        o_test = o_train

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o_train = f_out(o_train[-1])
        self.o_test = f_out(o_test[-1])
        # obj function to compute grad, use dropout
        self.cost = T.nnet.categorical_crossentropy(
            self.o_train, T.eye(n_out)[self.y])
        # compute accuracy use average of dropout rate
        self.accuracy = T.switch(T.eq(T.argmax(self.o_test), self.y), 1., 0.)
        # self.o_test is symbolic, so use the Theano op rather than numpy's
        self.prediction = T.argmax(self.o_test)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o_train = o_train[-1]
        self.o_test = o_test[-1]
        # obj function to compute grad, use dropout
        self.cost = (self.o_train[0] - self.y)**2
        # for regression "accuracy" is the squared error of the test output
        self.accuracy = (self.o_test[0] - self.y)**2
        self.prediction = self.o_test[0]

    # singular values of the hidden-to-hidden matrix (self.p[0] is Whh)
    _, self.Sigma, _ = T.nlinalg.SVD(full_matrices=1, compute_uv=1)(self.p[0])
    self.max_singular = T.max(self.Sigma)
    self.min_singular = T.min(self.Sigma)

    self.optimiser = sgd_optimizer(self, 'RNN')
def __init__(self, rng, n_in, n_out, n_h, n_r, margin=1.0, sig_mean=1.0, f_act=leaky_relu, f_out=softmax, obj='c'):
    '''
    Build the symbolic graph of an RNN whose hidden-to-hidden transform is
    applied through ``svd_H_wy(U, V, Sig, h)`` instead of a dense matrix.

    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param n_r: Number of reflection vectors (int)
    :param margin: half-width of the interval the singular values are
                   squashed into, centred on ``sig_mean``
    :param sig_mean: centre of the singular-value interval
    :param f_act: Hidden-to-hidden activation function
    :param f_out: Output activation function
    :param obj: objective type, 'c' for classification with cross entropy loss, 'r' for regression with MSE loss. (['c','r'])
    '''
    # Lower-triangular reflection vectors with unit-norm columns.
    U_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
    norms_U_ = np.linalg.norm(U_, axis=0)
    U_ = 1. / norms_U_ * U_

    V_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
    norms_V_ = np.linalg.norm(V_, axis=0)
    V_ = 1. / norms_V_ * V_

    # P = 0 puts every singular value at sig_mean initially.
    P_ = np.zeros(n_h)

    Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)), np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    bh_ = np.zeros(n_h)
    Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)), np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
    bo_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)), np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: Created shared variables
    floatX = theano.config.floatX
    Whi = theano.shared(name='Whi', value=Whi_.astype(floatX))
    U = theano.shared(name='U', value=U_.astype(floatX))
    V = theano.shared(name='V', value=V_.astype(floatX))
    P = theano.shared(name='P', value=P_.astype(floatX))
    bh = theano.shared(name='bh', value=bh_.astype(floatX))
    Woh = theano.shared(name='Woh', value=Woh_.astype(floatX))
    bo = theano.shared(name='bo', value=bo_.astype(floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(floatX))

    self.p = [U, V, P, Whi, Woh, bh, bo, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    # The input is a flat vector that is viewed as seq_len rows of n_in.
    self.x = T.vector()
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if n_h != n_r:
        # Number of reflection vectors is less than the hidden dimension
        def forward_prop_step(x_t, h_t_prev):
            # sigmoid(P) - 0.5 lies in (-0.5, 0.5), so Sig lies in
            # (sig_mean - margin, sig_mean + margin)
            Sig = 2 * margin * (sigmoid(P) - 0.5) + sig_mean
            h_t = f_act(Whi.dot(x_t) + svd_H_wy(U, V, Sig, h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]
    else:
        def forward_prop_step(x_t, h_t_prev):
            Sig = 2 * margin * (sigmoid(P) - 0.5) + sig_mean
            # Fold the last reflection vectors of U and V into the last
            # singular value and pass only the remaining columns on.
            Hu1SigHv1 = T.set_subtensor(Sig[-1], Sig[-1] * U[-1, -1] * V[-1, -1])
            h_t = f_act(
                Whi.dot(x_t)
                + svd_H_wy(U[:, :-1], V[:, :-1], Hu1SigHv1, h_t_prev)
                + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]

    [o_scan, _], _ = theano.scan(forward_prop_step,
                                 sequences=[x_scan],
                                 outputs_info=[None, h0],
                                 n_steps=seq_len)

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        self.o = f_out(o_scan[-1])
        # cross entropy against the one-hot row selected by the label
        self.cost = T.nnet.categorical_crossentropy(
            self.o, T.eye(n_out)[self.y])
        self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
        # self.o is symbolic, so use the Theano op rather than numpy's
        self.prediction = T.argmax(self.o)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o = o_scan[-1]
        self.cost = (self.o[0] - self.y)**2
        # for regression "accuracy" is the squared error
        self.accuracy = (self.o[0] - self.y)**2
        self.prediction = self.o[0]

    # self.p[2] is P; the mapping to singular values is monotonic, so the
    # extreme singular values come from the extreme entries of P.
    self.max_singular = 2 * margin * (sigmoid(T.max(self.p[2])) - 0.5) + sig_mean
    self.min_singular = 2 * margin * (sigmoid(T.min(self.p[2])) - 0.5) + sig_mean

    self.optimiser = sgd_optimizer(self, 'svdRNN')
def __init__(self, n_features):
    """Create the symbolic inputs and the shared parameters W and b.

    :param n_features: length of the weight vector W
    """
    float_type = theano.config.floatX
    self.n_features = n_features

    # symbolic inputs: a float32 vector and an int8 scalar
    self.x = T.fvector("x")
    self.y = T.bscalar("y")

    # parameters: W drawn from the module-level rng, b starts at zero
    initial_w = rng.randn(n_features).astype(float_type)
    initial_b = numpy.asarray(0., dtype=float_type)
    self.W = theano.shared(initial_w, name="W")
    self.b = theano.shared(initial_b, name="b")
valid_x = theano.shared(valid_x, borrow=True) train_y = T.cast(theano.shared(train_y, borrow=True), dtype='int32') valid_y = T.cast(theano.shared(valid_y, borrow=True), dtype='int32') # allocate learning rate and momentum shared variables learning_rate = theano.shared( np.array(learning_rate_schedule[0], dtype=theano.config.floatX)) momentum = theano.shared( np.array(momentum_schedule[0], dtype=theano.config.floatX)) # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of [int] labels dropout_active = T.bscalar( 'dropout_active') # a flag to enable and disable dropout ###################### # BUILD ACTUAL MODEL # ###################### print 'Building the model ...' # Reshape matrix of rasterized images of shape (batch_size, 48 * 48) # to a 4D tensor, compatible with our ConvPoolLayer layer0_input = x.reshape((batch_size, 1, 48, 48)) # layer10 = ConvPoolLayer( # rng, # input=layer0_input, # image_shape=(batch_size, 1, 98, 98), # filter_shape=(nkerns[0], 1, 4, 4),
max_egs=int(opts.num_from_train), parse_mode=opts.parse_mode) log("train_stats %s %s" % (len(train_x), train_stats)) dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab, update_vocab=False, max_egs=int(opts.num_from_dev), parse_mode=opts.parse_mode) log("dev_stats %s %s" % (len(dev_x), dev_stats)) # input/output example vars s1_idxs = T.ivector('s1') # sequence for sentence one s2_idxs = T.ivector('s2') # sequence for sentence two actual_y = T.ivector('y') # single for sentence pair label; 0, 1 or 2 # dropout keep prob for post concat, pre MLP apply_dropout = T.bscalar('apply_dropout') # dropout.{APPLY_DROPOUT|NO_DROPOUT} keep_prob = theano.shared(opts.keep_prob) # recall 1.0 => noop keep_prob = T.cast(keep_prob, 'float32') # shared weirdity, how to set in init (?) # keep track of different "layers" that handle their own gradients. # includes rnns, final concat & softmax and, potentially, special handling for # tied embeddings layers = [] # decide set of sequence idxs we'll be processing. there will always the two # for the forward passes over s1 and s2 and, optionally, two more for the # reverse pass over s1 & s2 in the bidirectional case. idxs = [s1_idxs, s2_idxs] names = ["s1f", "s2f"] if opts.bidirectional: idxs.extend([s1_idxs[::-1], s2_idxs[::-1]])
def __init__(self, rng, n_in, n_per_base, n_out, n_layer=1, basefuncs1=None, basefuncs2=None, gradient=None, with_shortcuts=False):
    """Initialize the parameters for the multilayer function graph

    :type rng: numpy.random.RandomState
    :param rng: a random number generator used to initialize weights

    :type n_in: int
    :param n_in: number of input units, the dimension of the space in which the datapoints lie

    :type n_layer: int
    :param n_layer: number of hidden layers

    :type n_per_base: int
    :param n_per_base: number of nodes per basis function see FGLayer

    :type n_out: int
    :param n_out: number of output units, the dimension of the space in which the labels lie

    :type basefuncs1: [int]
    :param basefuncs1: see FGLayer

    :type basefuncs2: [int]
    :param basefuncs2: see FGLayer

    :type gradient: string
    :param gradient: type of gradient descent algo
        (None, "sgd" or "sgd+" for clipped SGD; "adam", "adadelta", "rmsprop", "nag")

    :type with_shortcuts: bool
    :param with_shortcuts: whether to use shortcut connections (output is connected to all units)

    :raises ValueError: if ``gradient`` is not one of the values listed above
    """
    self.input = T.matrix('input')  # the data is presented as vector input
    self.labels = T.matrix('labels')  # the labels are presented as vector of continous values
    self.rng = rng
    self.n_layers = n_layer
    self.hidden_layers = []
    self.params = []
    self.n_in = n_in
    self.n_out = n_out
    self.with_shortcuts = with_shortcuts
    self.fixL0 = False

    # Stack the hidden layers; each one consumes the previous layer's output.
    for l in range(n_layer):
        if l == 0:
            layer_input = self.input
            n_input = n_in
        else:
            layer_input = self.hidden_layers[l - 1].output
            n_input = self.hidden_layers[l - 1].n_out

        hiddenLayer = FGLayer(
            rng=rng,
            inp=layer_input,
            n_in=n_input,
            n_per_base=n_per_base,
            basefuncs1=basefuncs1,
            basefuncs2=basefuncs2,
            layer_idx=l,
        )
        self.hidden_layers.append(hiddenLayer)
        self.params.extend(hiddenLayer.params)

    div_thresh = T.scalar("div_thresh")

    # The linear output layer, either it gets as input the output of ALL previous layers
    if self.with_shortcuts:
        output_layer_inp = T.concatenate(
            [l.output for l in reversed(self.hidden_layers)], axis=1)
        output_layer_n_in = sum([l.n_out for l in self.hidden_layers])
    else:  # or just of the last hidden layer
        output_layer_inp = self.hidden_layers[-1].output
        output_layer_n_in = self.hidden_layers[-1].n_out

    self.output_layer = DivisionRegression(rng=rng,
                                           inp=output_layer_inp,
                                           n_in=output_layer_n_in,
                                           n_out=n_out,
                                           div_thresh=div_thresh)
    self.params.extend(self.output_layer.params)

    self.evalfun = theano.function(
        inputs=[self.input, In(div_thresh, value=0.0001)],
        outputs=self.output_layer.output)

    L1_reg = T.scalar('L1_reg')
    L2_reg = T.scalar('L2_reg')
    fixL0 = T.bscalar('fixL0')

    # Regularisation terms summed over the output layer and all hidden layers.
    self.L1 = self.output_layer.L1 + sum(
        [l.L1 for l in self.hidden_layers])
    self.L2_sqr = self.output_layer.L2_sqr + sum(
        [l.L2_sqr for l in self.hidden_layers])
    self.penalty = self.output_layer.penalty

    self.loss = self.output_layer.loss
    self.errors = self.loss
    self.cost = (self.loss(self.labels) + L1_reg * self.L1
                 + L2_reg * self.L2_sqr + self.penalty)

    # Extrapol penalty
    self.extrapol_cost = self.output_layer.extrapol_loss

    learning_rate = T.scalar('learning_rate')

    def process_updates(par, newp):
        if par.name == "W":
            # if fixL0 is True, then keep small weights at 0
            return par, ifelse(
                fixL0, T.switch(T.abs_(par) < 0.001, par * 0, newp), newp)
        return par, newp

    print("Gradient: %s" % (gradient,))
    if gradient is None or gradient in ('sgd+', 'sgd'):
        gparams = [T.grad(self.cost, param) for param in self.params]
        # plain SGD with the step clipped element-wise to [-1, 1]
        update = OrderedDict([
            (param, param - (learning_rate * gparam).clip(-1.0, 1.0))
            for param, gparam in zip(self.params, gparams)
        ])
    elif gradient == 'adam':
        update = Lupdates.adam(self.cost, self.params, learning_rate,
                               epsilon=1e-04)
    elif gradient == 'adadelta':
        update = Lupdates.adadelta(self.cost, self.params, learning_rate)
    elif gradient == 'rmsprop':
        update = Lupdates.rmsprop(self.cost, self.params, learning_rate)
    elif gradient == 'nag':
        update = Lupdates.nesterov_momentum(self.cost, self.params,
                                            learning_rate)
    else:
        # The former `assert ("unknown gradient " + gradient)` could never
        # fail because a non-empty string is always truthy.
        raise ValueError("unknown gradient %r" % (gradient,))

    # Extrapol sanity gradient computation:
    extrapol_updates = Lupdates.adam(self.extrapol_cost, self.params,
                                     learning_rate, epsilon=1e-04)

    updates = [process_updates(*up) for up in update.items()]

    self.train_model = theano.function(
        inputs=[
            self.input, self.labels, L1_reg, L2_reg, fixL0, learning_rate,
            div_thresh
        ],
        outputs=self.cost,
        updates=updates,
    )
    # avoid too large outputs in extrapolation domain
    self.remove_extrapol_error = theano.function(
        inputs=[self.input, learning_rate, div_thresh],
        outputs=self.extrapol_cost,
        updates=extrapol_updates,
    )
    self.test_model = theano.function(
        inputs=[self.input, self.labels, In(div_thresh, value=0.0001)],
        outputs=self.errors(self.labels),
    )
    self.validate_model = theano.function(
        inputs=[self.input, self.labels, In(div_thresh, value=0.0001)],
        outputs=self.errors(self.labels),
    )
    self.L1_loss = theano.function(
        inputs=[],
        outputs=self.L1,
    )
    self.MSE = theano.function(
        inputs=[self.input, self.labels, In(div_thresh, value=0.0001)],
        outputs=self.errors(self.labels),
    )
def __init__(self, dataset, learning_rate=0.001, decrease_constant=0, hidden_sizes=None, random_seed=1234, batch_size=1, hidden_activation=T.nnet.sigmoid, use_cond_mask=False, direct_input_connect="None", direct_output_connect=False, update_rule="None", dropout_rate=0, weights_initialization="Uniform", mask_distribution=0):
    """Build the masked autoencoder graph and compile its Theano functions.

    :param dataset: mapping with 'input_size' and, for 'train', 'valid' and
        'test', a 'data' entry that is used both as input and as target
    :param learning_rate: learning rate (passed as ``epsilon`` to AdaDelta)
    :param decrease_constant: decay constant of the update rule
    :param hidden_sizes: list of hidden layer sizes; defaults to ``[500]``
    :param random_seed: seed of the SeedGenerator that feeds every RNG
    :param batch_size: number of rows per training mini-batch
    :param hidden_activation: activation of the hidden layers
    :param use_cond_mask: forwarded to every layer
    :param direct_input_connect: "None", "Output" or "Full"
    :param direct_output_connect: connect hidden layer inputs to the output layer
    :param update_rule: "None", "adadelta", "adagrad", "rmsprop", "adam"
        or "adam_paper"
    :param dropout_rate: dropout rate of the hidden layers
    :param weights_initialization: name of a WeightsInitializer method
    :param mask_distribution: forwarded to MaskGenerator
    :raises ValueError: if ``update_rule`` is not one of the names above
    """
    # A fresh list per instance instead of a shared mutable default.
    if hidden_sizes is None:
        hidden_sizes = [500]

    input_size = dataset['input_size']
    self.shuffled_once = False

    # Every RNG below gets its own seed; the order of get() calls matters.
    self.seed_generator = SeedGenerator(random_seed)
    self.trng = T.shared_randomstreams.RandomStreams(self.seed_generator.get())

    # Get the weights initializer by string name
    weights_initialization = getattr(
        WeightsInitializer(self.seed_generator.get()), weights_initialization)

    # Building the model's graph
    input_var = T.matrix(name="input")
    target = T.matrix(name="target")
    is_train = T.bscalar(name="is_train")

    # Initialize the mask
    self.mask_generator = MaskGenerator(
        input_size, hidden_sizes, mask_distribution, self.seed_generator.get())

    # Initialize layers
    input_layer = ConditionningMaskedLayer(
        layerIdx=0,
        input=input_var,
        n_in=input_size,
        n_out=hidden_sizes[0],
        activation=hidden_activation,
        weights_initialization=weights_initialization,
        mask_generator=self.mask_generator,
        use_cond_mask=use_cond_mask)
    self.layers = [dropoutLayerDecorator(input_layer, self.trng, is_train, dropout_rate)]

    # Now the hidden layers
    for i in range(1, len(hidden_sizes)):
        previous_layer = self.layers[i - 1]
        connect_input = (direct_input_connect == "Full"
                         and previous_layer.output != input_var)
        hidden_layer = DirectInputConnectConditionningMaskedLayer(
            layerIdx=i,
            input=previous_layer.output,
            n_in=hidden_sizes[i - 1],
            n_out=hidden_sizes[i],
            activation=hidden_activation,
            weights_initialization=weights_initialization,
            mask_generator=self.mask_generator,
            use_cond_mask=use_cond_mask,
            direct_input=input_var if connect_input else None)
        self.layers += [dropoutLayerDecorator(hidden_layer, self.trng, is_train, dropout_rate)]

    # And the output layer
    outputLayerIdx = len(self.layers)
    previous_layer = self.layers[outputLayerIdx - 1]
    connect_input = ((direct_input_connect == "Full"
                      or direct_input_connect == "Output")
                     and previous_layer.output != input_var)
    # Computed before the output layer is appended, so the slice drops the
    # first and the last hidden layer.
    if direct_output_connect:
        direct_outputs = [(layer.layer_idx, layer.n_in, layer.input)
                          for layer in self.layers[1:-1]]
    else:
        direct_outputs = []
    self.layers += [DirectOutputInputConnectConditionningMaskedOutputLayer(
        layerIdx=outputLayerIdx,
        input=previous_layer.output,
        n_in=hidden_sizes[outputLayerIdx - 1],
        n_out=input_size,
        activation=T.nnet.sigmoid,
        weights_initialization=weights_initialization,
        mask_generator=self.mask_generator,
        use_cond_mask=use_cond_mask,
        direct_input=input_var if connect_input else None,
        direct_outputs=direct_outputs)]

    # The loss function
    output = self.layers[-1].output
    pre_output = self.layers[-1].lin_output
    # Bernoulli log-likelihood written with softplus on the pre-activation;
    # equivalent to sum(t*log(o) + (1-t)*log(1-o)) for binary targets.
    log_prob = -T.sum(
        T.nnet.softplus(-target * pre_output + (1 - target) * pre_output),
        axis=1)
    loss = (-log_prob).mean()

    # How to update the parameters
    self.parameters = [param for layer in self.layers for param in layer.params]
    parameters_gradient = T.grad(loss, self.parameters)

    # Initialize update_rule
    if update_rule == "None":
        self.update_rule = DecreasingLearningRate(learning_rate, decrease_constant)
    elif update_rule == "adadelta":
        self.update_rule = AdaDelta(decay=decrease_constant, epsilon=learning_rate)
    elif update_rule == "adagrad":
        self.update_rule = AdaGrad(learning_rate=learning_rate)
    elif update_rule == "rmsprop":
        self.update_rule = RMSProp(learning_rate=learning_rate, decay=decrease_constant)
    elif update_rule == "adam":
        self.update_rule = Adam(learning_rate=learning_rate)
    elif update_rule == "adam_paper":
        self.update_rule = Adam_paper(learning_rate=learning_rate)
    else:
        # Previously an unknown name fell through silently and failed on the
        # next line with an unrelated-looking error.
        raise ValueError("unknown update_rule %r" % (update_rule,))
    updates = self.update_rule.get_updates(list(zip(self.parameters, parameters_gradient)))

    # How to to shuffle weights
    masks_updates = [layer_mask_update
                     for layer in self.layers
                     for layer_mask_update in layer.shuffle_update]
    self.update_masks = theano.function(name='update_masks',
                                        inputs=[],
                                        updates=masks_updates)

    # Functions to train and use the model
    index = T.lscalar()

    def autoencode(data):
        # The model reconstructs its input: data is both input and target.
        return {input_var: data, target: data}

    train_data = dataset['train']['data']
    train_batch = train_data[index * batch_size:(index + 1) * batch_size]

    # on_unused_input='ignore' everywhere: is_train is unused when dropout
    # is absent.
    self.learn = theano.function(name='learn',
                                 inputs=[index, is_train],
                                 outputs=loss,
                                 updates=updates,
                                 givens=autoencode(train_batch),
                                 on_unused_input='ignore')

    self.use = theano.function(name='use',
                               inputs=[input_var, is_train],
                               outputs=output,
                               on_unused_input='ignore')

    # Test functions
    self.valid_log_prob = theano.function(name='valid_log_prob',
                                          inputs=[is_train],
                                          outputs=log_prob,
                                          givens=autoencode(dataset['valid']['data']),
                                          on_unused_input='ignore')
    self.train_log_prob = theano.function(name='train_log_prob',
                                          inputs=[is_train],
                                          outputs=log_prob,
                                          givens=autoencode(train_data),
                                          on_unused_input='ignore')
    # fixed evaluation batches of 1000 rows, independent of batch_size
    self.train_log_prob_batch = theano.function(
        name='train_log_prob_batch',
        inputs=[index, is_train],
        outputs=log_prob,
        givens=autoencode(train_data[index * 1000:(index + 1) * 1000]),
        on_unused_input='ignore')
    self.test_log_prob = theano.function(name='test_log_prob',
                                         inputs=[is_train],
                                         outputs=log_prob,
                                         givens=autoencode(dataset['test']['data']),
                                         on_unused_input='ignore')

    # Functions for verify gradient
    self.useloss = theano.function(name='useloss',
                                   inputs=[input_var, target, is_train],
                                   outputs=loss,
                                   on_unused_input='ignore')
    self.learngrad = theano.function(name='learn',
                                     inputs=[index, is_train],
                                     outputs=parameters_gradient,
                                     givens=autoencode(train_batch),
                                     on_unused_input='ignore')

    # adding functions to extract embeddings from each layer
    self.embedding_funcs = [
        theano.function(name='embedding-{}'.format(i),
                        inputs=[input_var, is_train],
                        outputs=layer.output,
                        on_unused_input='ignore')
        for i, layer in enumerate(self.layers[:-1])
    ]

    # NOTE: the predict method (for decoding) is possible only when there is no skip
    # connections to the output layer
    if direct_input_connect == 'None' and not direct_output_connect:
        print('No skip connections! defining decoding function')
        pred_threshold = T.vector()
        last_layer_embeddings = T.matrix(name="ll-embeddings")
        output_probs = T.matrix(name="output-probs")

        pred_probs = output
        predictions = T.switch(pred_probs < pred_threshold, 0, 1)
        thresholded_output = T.switch(output_probs < pred_threshold, 0, 1)

        # Feed embeddings straight into the output layer by substituting
        # its input variable.
        decode_givens = {self.layers[-1].input: last_layer_embeddings}
        self.predict_probs = theano.function(name='predict_probs',
                                             inputs=[last_layer_embeddings, is_train],
                                             outputs=pred_probs,
                                             givens=decode_givens,
                                             on_unused_input='ignore')
        self.threshold_probs = theano.function(name='threshold_probs',
                                               inputs=[output_probs, pred_threshold],
                                               outputs=thresholded_output,
                                               on_unused_input='ignore')
        self.predict_func = theano.function(name='predict',
                                            inputs=[last_layer_embeddings, pred_threshold],
                                            outputs=predictions,
                                            givens=decode_givens,
                                            on_unused_input='ignore')
    else:
        self.predict_func = None
        print('Skip connections detected! decoding will fail!')
"""Wrapper for creating Theano functions for training/inference mode""" import theano from theano import tensor as T # 1 = training mode # 0 = inference mode train_mode = T.bscalar('train_mode') def function(inputs=[], outputs=[], default_mode=0, **kwargs): inputs = list(inputs) outputs_list = list(outputs) if type(outputs) in (list, tuple) \ else [outputs] use_train_mode = train_mode in theano.gof.graph.ancestors( inputs + outputs_list) extra_args = [train_mode] if use_train_mode else [] f = theano.function( list(inputs)+extra_args, outputs, on_unused_input='warn', **kwargs) def g(*args): if default_mode is None: # args[-1] is the train_mode value if use_train_mode: # f() includes train_mode, pass arguments directly return f(*args) else: # f() does not include train_mode, drop the last argument return f(*(args[:-1]))
def __init__(self, layer_sizes, n_samples, alpha, learning_rate, v_prior,
             batch_size, X_train, y_train, N_train):
    """Create the network and compile its Theano minibatch functions.

    The first entry of (a copy of) ``layer_sizes`` is increased by one
    before the network is built; the caller's list is left untouched.

    Attributes set on the instance:

    * ``batch_size``, ``N_train``, ``X_train``, ``y_train``, ``rate`` --
      the corresponding constructor arguments (``rate`` is
      ``learning_rate``).
    * ``network`` -- ``network.Network(layer_sizes, n_samples, v_prior,
      N_train)``.
    * ``x``, ``y``, ``indexes``, ``lr`` -- symbolic inputs.
    * ``fwpass`` -- compiled function of ``(x, samples)`` returning
      ``network.output(x, False, samples=..., use_indices=False)``.
    * ``estimate_marginal_ll`` -- symbolic objective handed to ``adam``.
    * ``process_minibatch`` -- compiled function of a batch index that
      returns ``estimate_marginal_ll`` and applies the ``adam`` updates.
    * ``error_minibatch_train`` -- compiled function of a batch index
      returning the summed squared difference between the output averaged
      over axis 0 and ``y``, divided by the last layer size.
    * ``ll_minibatch_train`` -- compiled function of a batch index
      returning ``sum(LogSumExp(sum(ll, 2), 0) + log(1 / n_samples))``.

    NOTE(review): ``X_train`` / ``y_train`` are sliced with a symbolic
    index inside ``givens``, so they are presumably Theano shared
    variables -- confirm against the caller.
    """
    sizes = copy.copy(layer_sizes)
    sizes[0] = sizes[0] + 1
    print(sizes)

    self.batch_size = batch_size
    self.N_train = N_train
    self.X_train = X_train
    self.y_train = y_train
    self.rate = learning_rate

    # Build the network.
    self.network = network.Network(sizes, n_samples, v_prior, N_train)

    # Symbolic minibatch index and the shared vector of training indexes.
    index = T.lscalar()
    self.indexes = T.vector('index', dtype='int32')
    indexes_train = theano.shared(
        value=np.arange(0, N_train, dtype=np.int32), borrow=True)

    self.x = T.tensor3('x', dtype=theano.config.floatX)
    self.y = T.matrix('y', dtype=theano.config.floatX)
    self.lr = T.fscalar()

    def minibatch(data):
        # Rows of `data` that belong to minibatch number `index`.
        return data[index * batch_size:(index + 1) * batch_size]

    def minibatch_givens():
        # Substitutions feeding one minibatch into x, y and indexes; the
        # inputs are tiled n_samples times along a new leading axis.
        return {
            self.x: T.tile(minibatch(self.X_train), [n_samples, 1, 1]),
            self.y: minibatch(self.y_train),
            self.indexes: minibatch(indexes_train),
        }

    # Forward pass on arbitrary inputs.
    sampl = T.bscalar()
    forward_output = self.network.output(
        self.x, False, samples=sampl, use_indices=False)
    self.fwpass = theano.function(
        inputs=[self.x, sampl],
        outputs=forward_output,
        allow_input_downcast=True)

    # The logarithm of the values for the likelihood factors.
    ll_train = self.network.log_likelihood_values(
        self.x, self.y, self.indexes, 0.0, 1.0)

    # Objective, assembled term by term (same association as a single
    # left-to-right expression: data - q - q_z + prior).
    tilted_ll = alpha * (T.sum(ll_train, 2)
                         - self.network.log_f_hat()
                         - self.network.log_f_hat_z())
    data_term = (-1.0 * N_train / (self.x.shape[1] * alpha)
                 * T.sum(LogSumExp(tilted_ll, 0) + T.log(1.0 / n_samples)))
    q_term = self.network.log_normalizer_q()
    q_z_term = (1.0 * N_train / self.x.shape[1]
                * self.network.log_normalizer_q_z())
    prior_term = self.network.log_Z_prior()
    self.estimate_marginal_ll = data_term - q_term - q_z_term + prior_term

    # Theano function for updating q.
    upd = adam(self.estimate_marginal_ll, self.network.params,
               minibatch(indexes_train), self.rate,
               rescale_local=np.float32(N_train / batch_size))
    self.process_minibatch = theano.function(
        [index],
        self.estimate_marginal_ll,
        updates=upd,
        givens=minibatch_givens())

    # Theano function for making predictions.
    mean_output = T.mean(
        self.network.output(self.x, self.indexes), 0,
        keepdims=True)[0, :, :]
    squared_error = T.sum((mean_output - self.y) ** 2) / sizes[-1]
    self.error_minibatch_train = theano.function(
        [index], squared_error, givens=minibatch_givens())

    minibatch_ll = T.sum(
        LogSumExp(T.sum(ll_train, 2), 0) + T.log(1.0 / n_samples))
    self.ll_minibatch_train = theano.function(
        [index], minibatch_ll, givens=minibatch_givens())
def __init__(self, rng, n_in, n_out, n_h, n_r, f_act=leaky_relu,
             f_out=softmax, obj='c'):
    '''
    Initialise the parameters and build the symbolic graph of the network.

    :param rng: Numpy RandomState
    :param n_in: Input dimension (int)
    :param n_out: Output dimension (int)
    :param n_h: Hidden dimension (int)
    :param n_r: Number of reflection vectors (int)
    :param f_act: Hidden-to-hidden activation function
    :param f_out: Output activation function
    :param obj: objective type, 'c' for classification with cross entropy
        loss, 'r' for regression with MSE loss. (['c','r'])

    Attributes set on the instance: ``p`` (list of shared parameters
    ``[U, Whi, Woh, bh, bo, h0]``), ``seq_len``, ``x``, ``y``, ``o``,
    ``cost``, ``accuracy``, ``prediction`` and ``optimiser``. For any
    ``obj`` other than 'c' or 'r' the attributes ``y``, ``o``, ``cost``,
    ``accuracy`` and ``prediction`` are not created.
    '''
    # NOTE: the random draws below must stay in this order (U, Whi, Woh,
    # h0) so that a given `rng` state keeps yielding the same parameters.
    U_ = np.tril(rng.normal(0, 0.01, (n_h, n_r)))
    norms = np.linalg.norm(U_, axis=0)
    U_ = 1. / norms * U_  # rescale every column of U to unit norm

    Whi_ = rng.uniform(-np.sqrt(6. / (n_in + n_h)),
                       np.sqrt(6. / (n_in + n_h)), (n_h, n_in))
    bh_ = np.zeros(n_h)
    Woh_ = rng.uniform(-np.sqrt(6. / (n_out + n_h)),
                       np.sqrt(6. / (n_h + n_out)), (n_out, n_h))
    bo_ = np.zeros(n_out)
    h0_ = rng.uniform(-np.sqrt(3. / (2. * n_h)),
                      np.sqrt(3. / (2. * n_h)), n_h)

    # Theano: create the shared variables (cast to floatX).
    Whi = theano.shared(name='Whi', value=Whi_.astype(theano.config.floatX))
    U = theano.shared(name='U', value=U_.astype(theano.config.floatX))
    bh = theano.shared(name='bh', value=bh_.astype(theano.config.floatX))
    Woh = theano.shared(name='Woh', value=Woh_.astype(theano.config.floatX))
    bo = theano.shared(name='bo', value=bo_.astype(theano.config.floatX))
    h0 = theano.shared(name='h0', value=h0_.astype(theano.config.floatX))

    self.p = [U, Whi, Woh, bh, bo, h0]

    seq_len = T.iscalar('seq_len')
    self.seq_len = seq_len
    # The input is a flat vector that is reshaped to (seq_len, n_in) so
    # that scan can iterate over one n_in-sized row per step.
    self.x = T.vector()
    x_scan = T.reshape(self.x, [seq_len, n_in], ndim=2)

    if n_h != n_r:
        # Number of reflection vectors is less than the hidden dimension.
        def forward_prop_step(x_t, h_t_prev):
            h_t = f_act(Whi.dot(x_t) + H_wy(U, h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]
    else:
        # n_r == n_h: the last entry of the previous state is scaled by
        # U[-1, -1] and only the first n_r - 1 columns go through H_wy.
        def forward_prop_step(x_t, h_t_prev):
            h_t_prev = T.set_subtensor(h_t_prev[-1],
                                       h_t_prev[-1] * U[-1, -1])
            h_t = f_act(Whi.dot(x_t) + H_wy(U[:, :-1], h_t_prev) + bh)
            o_t = Woh.dot(h_t) + bo
            return [o_t, h_t]

    ## For loop version below (when n_r < n_h), kept for reference:
    # def forward_prop_step(x_t, h_t_prev):
    #     Wh = h_t_prev
    #     for i in range(n_r):
    #         Wh -= 2. * U[:, n_r - i - 1] * T.dot(U[:, n_r - i - 1], Wh)
    #     h_t = f_act(Whi.dot(x_t) + Wh + bh)
    #     o_t = Woh.dot(h_t) + bo
    #     return [o_t, h_t]

    # Only the per-step outputs are kept; the hidden states are discarded.
    [o_scan, _], _ = theano.scan(forward_prop_step,
                                 sequences=[x_scan],
                                 outputs_info=[None, h0],
                                 n_steps=seq_len)

    if obj == 'c':  # classification task
        self.y = T.bscalar('y')
        # Output activation applied to the output of the last time step.
        self.o = f_out(o_scan[-1])
        # Cross entropy against the one-hot row selected by the label.
        self.cost = T.nnet.categorical_crossentropy(
            self.o,
            T.eye(n_out)[self.y])
        # 1. when the arg-max class equals the label, 0. otherwise.
        self.accuracy = T.switch(T.eq(T.argmax(self.o), self.y), 1., 0.)
        # Must be the symbolic arg-max: NumPy's argmax does not operate on
        # a Theano variable and would not yield a graph node.
        self.prediction = T.argmax(self.o)
    elif obj == 'r':  # regression task
        self.y = T.dscalar('y')
        self.o = o_scan[-1]
        # Squared error of the first output unit at the last time step;
        # it is used both as the cost and as the reported "accuracy".
        self.cost = (self.o[0] - self.y)**2
        self.accuracy = (self.o[0] - self.y)**2
        self.prediction = self.o[0]

    self.optimiser = sgd_optimizer(self, 'oRNN')
def make_node(self, *inputs):
    """Build the Apply node connecting ``inputs`` to this op.

    Each input is converted with ``tt.as_tensor_variable``; the node has a
    single output variable created by ``tt.bscalar()``.
    """
    tensor_inputs = list(map(tt.as_tensor_variable, inputs))
    result = tt.bscalar()
    return gof.Apply(self, tensor_inputs, [result])
def optimiser(self, num_samples, update, update_kwargs, saved_update=None):
    """Compile the Theano training function and return it with its updates.

    :param num_samples: number of times each row of the batch is repeated.
    :param update: callable invoked as ``update(**update_kwargs)``; its
        result is passed to ``theano.function`` as ``updates``.
    :param update_kwargs: dict of keyword arguments for ``update``. It is
        modified in place: the keys ``'loss_or_grads'`` and ``'params'``
        are overwritten with the gradients and parameters built here.
    :param saved_update: optional mapping; when given, the value of each
        of its keys is copied into the corresponding key of the new
        updates, pairing the two in iteration order.
    :return: tuple ``(compiled_function, updates)``. The compiled function
        takes ``(batch, overdisp_dim, overdisp_s)`` and returns the dot
        product of the flattened log-weights with the normalised weights.
    """
    recognition = self.recognition_model
    generative = self.generative_model

    # Symbolic inputs.
    overdisp_dim = T.bscalar('overdisp_dim')
    overdisp_s = T.bscalar('overdisp_s')
    batch = T.matrix('batch')
    batch_rep = T.repeat(batch, num_samples, axis=0)

    # Regular latent samples, with one sample per batch row (offset
    # `overdisp_s`, column `overdisp_dim`) replaced by the overdispersed
    # sample.
    h_regular_rep = recognition.get_samples_latents_regular(batch_rep)
    h_overdisp = recognition.get_samples_latents_overdisp(
        batch, overdisp_dim)
    h_rep = T.set_subtensor(
        h_regular_rep[overdisp_s::num_samples, overdisp_dim], h_overdisp)

    log_p_h = generative.log_p_h(h_rep)
    log_p_x = generative.log_p_x(h_rep, batch_rep)
    entropies_h = recognition.entropies_latents(h_rep, batch_rep)
    log_w_rep = log_p_x + entropies_h + log_p_h

    per_row_shape = (batch.shape[0], num_samples)
    log_w_matrix = log_w_rep.reshape(per_row_shape)

    v = recognition.importance_weights_latents(
        h_overdisp, batch, overdisp_dim)
    log_u_matrix = (T.repeat(v, num_samples).reshape(per_row_shape)
                    + log_w_matrix)
    log_u_rep = log_u_matrix.flatten()

    # Normalise the weights within each row; the row maximum is subtracted
    # before exponentiating.
    log_u_minus_max = log_u_matrix - T.max(
        log_u_matrix, axis=1, keepdims=True)
    u_matrix = T.exp(log_u_minus_max)
    u_normalized_matrix = u_matrix / T.sum(u_matrix, axis=1, keepdims=True)
    u_normalized_rep = T.reshape(u_normalized_matrix, log_w_rep.shape)

    # Every parameter except the last recognition parameter (tau).
    weighted_params = (generative.get_params()
                       + recognition.get_params()[:-1])
    dummy_vec = T.vector(dtype=theano.config.floatX)

    def weighted_grads(log_u, weights):
        # Differentiate with the weights held fixed (as `dummy_vec`), then
        # substitute the actual weights into the resulting graph.
        return theano.clone(
            T.grad(-T.dot(log_u, dummy_vec), weighted_params),
            replace={dummy_vec: weights})

    grads = weighted_grads(log_u_rep, u_normalized_rep)

    tau = recognition.get_params()[-1]

    # The same gradients computed separately for each row of the
    # transposed weight matrices; their variance drives the tau gradient.
    all_grads, _ = theano.scan(
        lambda s, log_u, u_norm: weighted_grads(log_u[s], u_norm[s]),
        sequences=[T.arange(log_u_matrix.T.shape[0])],
        non_sequences=[log_u_matrix.T, u_normalized_matrix.T],
    )
    variance = T.sum([T.sum(T.var(g, axis=0)) for g in all_grads])
    grad_tau = T.grad(variance, tau)
    grads += [grad_tau]

    all_params = generative.get_params() + recognition.get_params()

    update_kwargs['loss_or_grads'] = grads
    update_kwargs['params'] = all_params
    updates = update(**update_kwargs)

    if saved_update is not None:
        # Restore previously saved values into the new update variables.
        for new_var, saved_var in zip(updates, saved_update.keys()):
            new_var.set_value(saved_var.get_value())

    train_fn = theano.function(
        inputs=[batch, overdisp_dim, overdisp_s],
        outputs=T.dot(log_u_rep, u_normalized_rep),
        updates=updates,
    )

    return train_fn, updates