def __init__(self, numpy_rng=None, theano_rng=None, n_input=150, n_hidden=50, n_label=3, n_delay=6, freq=3): """ :type numpy_rng: np.random.RandomState :param numpy_rng: numpy random number generator used to draw initial weights :type theano_rng: theano.tensor.shared_randomstreams.RandomStreams :param theano_rng: Theano random generator; if None is given one is generated based on a seed drawn from `rng` :type n_input: int :param n_input: dimension of the input to the DBN :type n_hidden: int :param n_hidden: intermediate layer size :type n_label: int :param n_label: dimension of the output of the network (label layers) :type n_delay: int :param n_delay: number of past visible layer in the CRBM """ self.params = [] self.n_input = n_input self.n_hidden = n_hidden self.n_label = n_label self.delay = n_delay self.freq = freq if numpy_rng is None: # create a number generator numpy_rng = np.random.RandomState(1234) if not theano_rng: theano_rng = MRG_RandomStreams(numpy_rng.randint(2**30)) # allocate symbolic variables for the data self.x = T.matrix('x') # the data is presented as rasterized images self.x_history = T.matrix( 'x_history') #memory : past visible is a recopy of visible layer self.y = T.lvector( 'y' ) # the labels are presented as 1D vector of [int] labels (digit) # Construct an CRBM that shared weights with this layer self.crbm_layer = CRBM(numpy_rng=numpy_rng, theano_rng=theano_rng, input=self.x, input_history=self.x_history, n_visible=n_input, n_hidden=n_hidden, delay=n_delay, freq=freq) self.params.append(self.crbm_layer.W) self.params.append(self.crbm_layer.B) self.params.append(self.crbm_layer.hbias) # We now need to add a logistic layer on top of the MLP input_logistic = T.nnet.sigmoid( T.dot(self.x, self.crbm_layer.W) + T.dot(self.x_history, self.crbm_layer.B) + self.crbm_layer.hbias) self.logLayer = LogisticRegression(input=input_logistic, n_in=n_hidden, n_out=n_label) self.params.extend(self.logLayer.params) # compute the cost for second phase of training, defined as the # negative log likelihood of the logistic regression (output) layer self.finetune_cost = self.logLayer.negative_log_likelihood(self.y) # compute the gradients with respect to the model parameters # symbolic variable that points to the number of errors made on the # minibatch given by self.x and self.y self.errors = self.logLayer.errors(self.y)
def make_output(self, output, collapse=True, sample_mean=None, gamma=None): self.output = output if collapse and self.depth > 1: self.output = self.make_consensus(self.output) if self.attrs['consensus'] == 'flat': self.attrs['n_out'] *= self.depth if self.attrs['batch_norm']: self.output = self.batch_norm( self.output, self.attrs['n_out'], sample_mean=sample_mean, gamma=gamma, use_sample=self.attrs['bn_use_sample']) if self.attrs['residual']: from .hidden import concat_sources z, n_in = concat_sources(self.sources, unsparse=True, expect_source=False) assert n_in == self.attrs['n_out'] self.output += z if self.attrs['layer_drop'] > 0.0: # Stochastic Depth, http://arxiv.org/abs/1603.09382 from .hidden import concat_sources z, n_in = concat_sources(self.sources, unsparse=True, expect_source=False) n_out = self.attrs['n_out'] if n_in != n_out: print("Layer drop with additional projection %i -> %i" % (n_in, n_out), file=log.v4) if n_in > 0: self.W_drop = self.add_param( self.create_forward_weights(n_in, n_out, name="W_drop_%s" % self.name)) z = T.dot(z, self.W_drop) else: z = 0 if self.train_flag: from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams rng = RandomStreams(self.rng.randint(1234) + 1) import theano.ifelse drop = rng.binomial(n=1, p=self.attrs['layer_drop'], size=(1, ), dtype='int8')[0] # drop = theano.printing.Print("drop")(drop) self.output = theano.ifelse.ifelse(drop, z, self.output) else: drop = self.attrs['layer_drop'] self.output = numpy.float32(drop) * z + numpy.float32( 1.0 - drop) * self.output if self.attrs['sparse']: self.output = T.argmax(self.output, axis=-1, keepdims=True) if self.attrs['sparse_filtering']: # https://dlacombejr.github.io/programming/2015/09/13/sparse-filtering-implemenation-in-theano.html fs = T.sqrt(self.output**2 + 1e-8) # numerical stability l2fs = T.sqrt(T.sum(fs**2, axis=1)) # l2 norm of row nfs = fs / l2fs.dimshuffle(0, 'x') # normalize rows l2fn = T.sqrt(T.sum(nfs**2, axis=0)) # l2 norm of column self.output = nfs / l2fn.dimshuffle('x', 0) # normalize columns self.output.name = "%s.output" % self.name self._output = output
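# --- Hedged sketch (not part of the original class): the stochastic-depth
# decision above, written standalone with illustrative names. During training
# a scalar Bernoulli draw picks either the projected input `skip` or the layer
# output `out`; at inference the two branches are blended deterministically.
import numpy
import theano
import theano.ifelse
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams

srng = MRG_RandomStreams(seed=1)
p_drop = 0.5                      # corresponds to attrs['layer_drop']

out = T.matrix('out')             # this layer's output
skip = T.matrix('skip')           # identity / projected input from the sources

# training: hard switch on a single int8 Bernoulli sample
drop = srng.binomial(n=1, p=p_drop, size=(1,), dtype='int8')[0]
train_output = theano.ifelse.ifelse(drop, skip, out)

# inference: deterministic blend, mirroring the non-training branch above
test_output = numpy.float32(p_drop) * skip + numpy.float32(1.0 - p_drop) * out

f_train = theano.function([out, skip], train_output)
f_test = theano.function([out, skip], test_output)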
y = state.get_value()
assert is_binary(y)
s = y.sum(axis=1)
assert np.all(s == 1)

validate_all_samples()

if vis_sample.ndim == 4:
    vis_sample.set_value(vis_batch)
else:
    vis_sample.set_value(dataset.get_design_matrix(vis_batch))

validate_all_samples()

theano_rng = MRG_RandomStreams(2012 + 9 + 18)

# Do one round of clamped sampling so the seed data gets to have an influence.
# The sampling is bottom-to-top, so if we don't do an initial round where we
# explicitly clamp vis_sample, its initial value gets discarded with no influence.
sampling_updates = model.get_sampling_updates(
    layer_to_state, theano_rng,
    layer_to_clamp={model.visible_layer: True})

t1 = time.time()
sample_func = function([], updates=sampling_updates)
t2 = time.time()
print 'Clamped sampling function compilation took', t2 - t1
sample_func()

# Now compile the full sampling update
sampling_updates = model.get_sampling_updates(layer_to_state, theano_rng)
def __init__(self, incoming, encoder, decoder, x_distribution='bernoulli', pz_distribution='gaussian', qz_distribution='gaussian', latent_size=50, W=init.Normal(0.01), b=init.Normal(0.01), **kwargs): super(VAELayer, self).__init__(incoming, **kwargs) num_batch, n_features = self.input_shape self.num_batch = num_batch self.n_features = n_features self.x_distribution = x_distribution self.pz_distribution = pz_distribution self.qz_distribution = qz_distribution self.encoder = encoder self.decoder = decoder self._srng = RandomStreams() if self.x_distribution not in ['gaussian', 'bernoulli']: raise NotImplementedError if self.pz_distribution not in ['gaussian', 'gaussianmarg']: raise NotImplementedError if self.qz_distribution not in ['gaussian', 'gaussianmarg']: raise NotImplementedError self.params_encoder = lasagne.layers.get_all_params(encoder) self.params_decoder = lasagne.layers.get_all_params(decoder) for p in self.params_encoder: p.name = "VAELayer encoder :" + p.name for p in self.params_decoder: p.name = "VAELayer decoder :" + p.name self.num_hid_enc = encoder.output_shape[1] self.num_hid_dec = decoder.output_shape[1] self.latent_size = latent_size self.W_enc_to_z_mu = self.add_param(W, (self.num_hid_enc, latent_size)) self.b_enc_to_z_mu = self.add_param(b, (latent_size, )) self.W_enc_to_z_logsigma = self.add_param( W, (self.num_hid_enc, self.latent_size)) self.b_enc_to_z_logsigma = self.add_param(b, (latent_size, )) self.W_dec_to_x_mu = self.add_param( W, (self.num_hid_dec, self.n_features)) self.b_dec_to_x_mu = self.add_param(b, (self.n_features, )) self.W_params = [ self.W_enc_to_z_mu, self.W_enc_to_z_logsigma, self.W_dec_to_x_mu ] + self.params_encoder + self.params_decoder self.bias_params = [ self.b_enc_to_z_mu, self.b_enc_to_z_logsigma, self.b_dec_to_x_mu ] params_tmp = [] if self.x_distribution == 'gaussian': self.W_dec_to_x_logsigma = self.add_param( W, (self.num_hid_dec, self.n_features)) self.b_dec_to_x_logsigma = self.add_param(b, (self.n_features, )) self.W_params += [self.W_dec_to_x_logsigma] self.bias_params += [self.b_dec_to_x_logsigma] self.W_dec_to_x_logsigma.name = "VAE: W_dec_to_x_logsigma" self.b_dec_to_x_logsigma.name = "VAE: b_dec_to_x_logsigma" params_tmp = [self.W_dec_to_x_logsigma, self.b_dec_to_x_logsigma] self.params = self.params_encoder + [self.W_enc_to_z_mu, self.b_enc_to_z_mu, self.W_enc_to_z_logsigma, self.b_enc_to_z_logsigma] + self.params_decoder + \ [self.W_dec_to_x_mu, self.b_dec_to_x_mu] + params_tmp self.W_enc_to_z_mu.name = "VAELayer: W_enc_to_z_mu" self.W_enc_to_z_logsigma.name = "VAELayer: W_enc_to_z_logsigma" self.W_dec_to_x_mu.name = "VAELayer: W_dec_to_x_mu" self.b_enc_to_z_mu.name = "VAELayer: b_enc_to_z_mu" self.b_enc_to_z_logsigma.name = "VAELayer: b_enc_to_z_logsigma" self.b_dec_to_x_mu.name = "VAELayer: b_dec_to_x_mu"
def __init__(self, incomings, **kwargs):
    super(Q_Layer, self).__init__(incomings, **kwargs)
    self._srng = RandomStreams(get_rng().randint(1, 2147462579))
def random_normal(shape, mean=0.0, std=1.0, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.normal(size=shape, avg=mean, std=std, dtype=dtype)
class VAE: def __init__(self, n_in, n_hidden, n_out, n_hidden_decoder=None, trans_func=rectify, batch_size=100): self.n_in = n_in self.n_hidden = n_hidden self.n_out = n_out self.batch_size = batch_size self.transf = trans_func self.l_in = InputLayer(shape=(batch_size, n_in)) self.srng = RandomStreams() l_in_encoder = lasagne.layers.InputLayer(shape=(batch_size, n_in)) l_in_decoder = lasagne.layers.InputLayer(shape=(batch_size, n_out)) l_prev_encoder = l_in_encoder l_prev_decoder = l_in_decoder for i in range(len(n_hidden)): l_tmp_encoder = lasagne.layers.DenseLayer(l_prev_encoder, num_units=n_hidden[i], W=lasagne.init.Uniform(), nonlinearity=self.transf) l_prev_encoder = l_tmp_encoder if n_hidden_decoder is None: n_hidden_decoder = n_hidden self.n_hidden_decoder = n_hidden_decoder for i in range(len(n_hidden_decoder)): l_tmp_decoder = lasagne.layers.DenseLayer( l_prev_decoder, num_units=n_hidden_decoder[-(i + 1)], W=lasagne.init.Uniform(), nonlinearity=self.transf) l_prev_decoder = l_tmp_decoder l_in = lasagne.layers.InputLayer(shape=(batch_size, n_in)) self.model = VAELayer( l_in, encoder=l_prev_encoder, decoder=l_prev_decoder, latent_size=n_out, x_distribution='bernoulli', qz_distribution='gaussianmarg', #gaussianmarg pz_distribution='gaussianmarg') self.x = T.matrix('x') def build_model(self, train_x, test_x, valid_x, update, update_args): self.train_x = train_x self.test_x = test_x self.validation_x = valid_x self.update = update self.update_args = update_args self.index = T.iscalar('index') self.batch_slice = slice(self.index * self.batch_size, (self.index + 1) * self.batch_size) x = self.srng.binomial(size=self.x.shape, n=1, p=self.x) log_pz, log_qz_given_x, log_px_given_z = self.model.get_log_distributions( self.x) loss_eval = (log_pz + log_px_given_z - log_qz_given_x).sum() loss_eval /= self.batch_size all_params = get_all_params(self.model) updates = self.update(-loss_eval, all_params, *self.update_args) train_model = theano.function( [self.index], loss_eval, updates=updates, givens={ self.x: self.train_x[self.batch_slice], }, ) test_model = theano.function( [self.index], loss_eval, givens={ self.x: self.test_x[self.batch_slice], }, ) validate_model = theano.function( [self.index], loss_eval, givens={ self.x: self.validation_x[self.batch_slice], }, ) return train_model, test_model, validate_model def draw_sample(self, z): return self.model.draw_sample(z) def get_output(self, dat): z, _, _ = self.model.get_z_mu_sigma(dat) return z def get_reconstruction(self, z): return self.model.decoder_output(z)
# print data.shape # mask = numpy.zeros((90,90),dtype=numpy.float32) # mask[30:,:]=1. # train_features_numpy = data*mask.reshape(1,1,1,90,90) #labels = numpy.argmax(datafile.root.yd.read(),1).astype(numpy.int32).reshape(-1,) labels = datafile.root.y.read() if not os.path.isdir(output_folder): os.makedirs(output_folder) os.chdir(output_folder) print '... instantiating model' numpy_rng = numpy.random.RandomState(1) theano_rng = MRG_RandomStreams(numpy_rng.randint(2**15)) x = T.matrix('x').reshape((batchsize, 2, 5, 90, 90)) y = T.matrix('y') model = my_network( numpy_rng=numpy_rng, theano_rng=theano_rng, input=x, labels=y, Wl_path= '/home/konda/software/python_env/bin/project_odometry/Wl4CNN256Cr.npy', Wr_path= '/home/konda/software/python_env/bin/project_odometry/Wr4CNN256Cr.npy', image_shape=[8, 90, 90, batchsize], fsi=[8, 16, 16, n_filters],
def main(gan, optimizer, do_batch_norm, n_epochs, epoch_size, batch_size, initial_eta, eta_decay, threshold, activation, noise_type, dump): # Load the dataset print("Loading data...") X_train, y_train, X_val, y_val, X_test, y_test = load_dataset() if threshold != 0.0: X_train[X_train >= threshold] = 1 X_train[X_train < threshold] = 0 X_test[X_test >= threshold] = 1 X_test[X_test < threshold] = 0 # Instantiate a symbolic noise generator to use for training from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams srng = RandomStreams(seed=np.random.randint(2147462579, size=6)) if noise_type == 'normal': noise = srng.normal((batch_size, 100), avg=0.0, std=1) elif noise_type == 'uniform': noise = srng.uniform((batch_size, 100)) else: raise Exception("Noise {} not supported".format(noise_type)) # Prepare Theano variables for inputs and targets noise_var = T.matrix('noise') input_var = T.tensor4('inputs') # Create neural network model print("Building model and compiling functions...") generator = build_generator(noise_var, do_batch_norm, activation) critic = build_critic(gan, input_var, do_batch_norm) # Create expression for passing real data through the critic fake_in = lasagne.layers.get_output(generator) real_out = lasagne.layers.get_output(critic) # Create expression for passing fake data through the critic fake_out = lasagne.layers.get_output(critic, fake_in) # Create loss expressions if gan == 'dcgan': # Create loss expressions generator_loss = lasagne.objectives.binary_crossentropy(fake_out, 1) generator_loss = generator_loss.mean() critic_loss = (lasagne.objectives.binary_crossentropy(real_out, 1) + lasagne.objectives.binary_crossentropy(fake_out, 0)) critic_loss = critic_loss.mean() elif gan == 'lsgan': # a, b, c = -1, 1, 0 # Equation (8) in the paper a, b, c = 0, 1, 1 # Equation (9) in the paper generator_loss = lasagne.objectives.squared_error(fake_out, c).mean() critic_loss = (lasagne.objectives.squared_error(real_out, b).mean() + lasagne.objectives.squared_error(fake_out, a).mean()) elif gan in ('wgan', 'wgan-gp'): # original in Jan's code # generator_loss = fake_out.mean() # critic_loss = real_out.mean() - fake_out.mean() generator_loss = -fake_out.mean() critic_loss = -real_out.mean() + fake_out.mean() if gan == 'wgan-gp': # gradient penalty alpha = srng.uniform((batch_size, 1, 1, 1), low=0., high=1.) 
differences = fake_in - input_var interpolates = input_var + (alpha * differences) inter_out = lasagne.layers.get_output(critic, interpolates) gradients = theano.grad(inter_out.sum(), wrt=interpolates) slopes = T.sqrt(T.sum(T.sqr(gradients), axis=(1, 2, 3))) critic_penalty = 10 * T.mean((slopes - 1.)**2) # original in Jan's code # critic_loss -= critic_penalty critic_loss += critic_penalty else: raise Exception("GAN {} is not supported".format(gan)) # Create update expressions for training generator_params = lasagne.layers.get_all_params(generator, trainable=True) critic_params = lasagne.layers.get_all_params(critic, trainable=True) eta = theano.shared(lasagne.utils.floatX(initial_eta)) # choose the optimizer if optimizer == 'adam': generator_updates = lasagne.updates.adam(generator_loss, generator_params, learning_rate=eta, beta1=0.5, beta2=0.9) critic_updates = lasagne.updates.adam(critic_loss, critic_params, learning_rate=eta, beta1=0.5, beta2=0.9) elif optimizer == 'rmsprop': generator_updates = lasagne.updates.rmsprop(generator_loss, generator_params, learning_rate=eta) critic_updates = lasagne.updates.rmsprop(critic_loss, critic_params, learning_rate=eta) # Compile functions performing a training step on a mini-batch (according # to the updates dictionary) and returning the corresponding loss: generator_train_fn = theano.function([], generator_loss, givens={noise_var: noise}, updates=generator_updates) critic_train_fn = theano.function([input_var], critic_loss, givens={noise_var: noise}, updates=critic_updates) # Compile another function generating some data gen_fn = theano.function([noise_var], lasagne.layers.get_output(generator, deterministic=True)) # Finally, launch the training loop. print("Starting training...") # We create an infinite supply of batches (as an iterable generator): batches = iterate_minibatches(X_train, y_train, batch_size, shuffle=True, forever=True) # build preffix and suffix str for saving files prefix = "{}_mnist".format(gan) suffix = "non_lin_{}_opt_{}_bn_{}_etadecay_{}_thresh_{}_noise_{}".format( activation, optimizer, do_batch_norm, eta_decay, threshold, noise_type) # We iterate over epochs: n_generator_updates = 0 for epoch in tqdm(range(n_epochs)): # sample a batch of samples, plot them inc. histograms n_samples = 1000 samples = gen_fn(lasagne.utils.floatX(np.random.rand(n_samples, 100))) plot_samples( gan, samples, "samples/{}_samples_{}_{}.png".format(prefix, epoch, suffix)) plot_histogram( gan, samples, X_train, "{} : {} {}".format(gan, optimizer, epoch), "histograms/{}_hist_epoch_{}_{}.png".format(prefix, epoch, suffix)) critic_scores = [] generator_scores = [] for _ in range(epoch_size): for _ in range(get_critic_runs(gan, n_generator_updates)): inputs, targets = next(batches) critic_scores.append(critic_train_fn(inputs)) generator_scores.append(generator_train_fn()) n_generator_updates += 1 print(" generator loss:\t\t{}".format(np.mean(generator_scores))) print(" critic loss:\t\t{}".format(np.mean(critic_scores))) # After half the epochs, we start decaying the learn rate towards zero if eta_decay and epoch >= int(n_epochs / 2): progress = float(epoch) / n_epochs eta.set_value( lasagne.utils.floatX(initial_eta * 2 * (1 - progress))) # dump the network weights to a file: if dump: np.savez('models/{}_mnist_gen_{}.npz'.format(gan, suffix), *lasagne.layers.get_all_param_values(generator)) np.savez('models/{}_mnist_crit_{}.npz'.format(gan, suffix), *lasagne.layers.get_all_param_values(critic))
def main(num_epochs=200, convs=0, batchsize=64, initial_eta=5e-3, add_noise=True): # Load the dataset print("Loading data...") datapath = '/media/steampunkhd/rafaelvalle/datasets/MIDI/Piano' glob_file_str = '*.npy' n_pieces = 0 # 0 is equal to all pieces, unbalanced dataset crop = None # (32, 96) as_dict = False inputs, _ = load_data(datapath, glob_file_str, n_pieces, crop, as_dict) # scale to [0, 1] # inputs = (inputs + 1) * 0.5 # Prepare Theano variables for inputs and targets noise_var = T.matrix('noise') input_var = T.tensor4('inputs') # Instantiate a symbolic noise generator to use for training from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams srng = RandomStreams(seed=np.random.randint(2147462579, size=6)) # Create neural network model print("Building model and compiling functions...") generator = build_generator(noise_var, convs) discriminator = build_discriminator(input_var, convs) # Create expression for passing real data through the discriminator real_out = lasagne.layers.get_output(discriminator) # Create expression for passing fake data through the discriminator fake_out = lasagne.layers.get_output(discriminator, lasagne.layers.get_output(generator)) # Create loss expressions # one-sided label smoothing lbl_noise = 0.0 if add_noise: lbl_noise = srng.normal(size=(3, ), avg=0.0, std=0.1) generator_loss = lasagne.objectives.binary_crossentropy(fake_out, 1).mean() discriminator_loss = ( lasagne.objectives.binary_crossentropy(real_out, 1 + lbl_noise) + lasagne.objectives.binary_crossentropy(fake_out, 0)).mean() # Create update expressions for training generator_params = lasagne.layers.get_all_params(generator, trainable=True) discriminator_params = lasagne.layers.get_all_params(discriminator, trainable=True) eta = theano.shared(lasagne.utils.floatX(initial_eta)) updates = lasagne.updates.adam(generator_loss, generator_params, learning_rate=eta, beta1=0.9) updates.update( lasagne.updates.adam(discriminator_loss, discriminator_params, learning_rate=eta, beta1=0.9)) noise = srng.uniform((batchsize, 100)) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var], [(real_out > .5).mean(), (fake_out < .5).mean()], givens={noise_var: noise}, updates=updates) # Compile another function generating some data gen_fn = theano.function([noise_var], lasagne.layers.get_output(generator, deterministic=True)) obs_length = 128 print("Starting training...") for epoch in range(num_epochs): # In each epoch, we do a full pass over the training data: train_err = 0 train_batches = 0 start_time = time.time() for batch in iterate_minibatches(inputs, batchsize, length=obs_length): batch = lasagne.utils.floatX(batch) # reshape batch to proper dimensions batch = batch.reshape( (batch.shape[0], 1, batch.shape[1], batch.shape[2])) train_err += np.array(train_fn(batch)) train_batches += 1 # Then we print the results for this epoch: print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs, time.time() - start_time)) print(" training loss:\t\t{}".format(train_err / train_batches)) # And finally, we plot some generated data samples = gen_fn(lasagne.utils.floatX(np.random.rand(42, noise_size))) plt.imsave( 'images/dcgan_proll/proll_samples_epoch{}.png'.format(epoch), (samples.reshape(6, 7, obs_length, obs_length).transpose( 0, 2, 1, 3).reshape(6 * obs_length, 7 * obs_length)).T, cmap='gray', origin='bottom') # After half the epochs, start decaying the learning 
        # rate towards zero
        if epoch >= num_epochs // 2:
            progress = float(epoch) / num_epochs
            eta.set_value(
                lasagne.utils.floatX(initial_eta * 2 * (1 - progress)))
def apply_dropout(computation_graph, variables, drop_prob, rng=None, seed=None, custom_divisor=None): """Apply dropout to specified variables in a graph. Parameters ---------- computation_graph : instance of :class:`ComputationGraph` The computation graph. variables : list of :class:`~tensor.TensorVariable` Variables to be dropped out. drop_prob : float Probability of dropping out. If you want to apply the dropout with different probabilities for different layers, call it several times. rng : :class:`~theano.sandbox.rng_mrg.MRG_RandomStreams` Random number generator. seed : int Random seed to be used if `rng` was not specified. custom_divisor : float or None, optional Divide dropped variables by a given scalar value. If `None`, (default) dropped variables will be divided by `(1 - drop_prob)` which is equivalent to scaling by `(1 - drop_prob)` at test time as recommended in [DROPOUT]_. Returns ------- dropped_computation_graph : instance of :class:`ComputationGraph` A new computation graph with dropout applied to the specified variables. In order to train with, or monitor, the outputs of the original computation graph with dropout applies, use the variables contained in `dropped_computation_graph.outputs`. Notes ----- For more information, see [DROPOUT]_. .. [DROPOUT] Hinton et al. *Improving neural networks by preventing co-adaptation of feature detectors*, arXiv:1207.0580. Examples -------- >>> import numpy >>> from theano import tensor, function >>> from blocks.bricks import MLP, Identity >>> from blocks.filter import VariableFilter >>> from blocks.initialization import Constant >>> from blocks.roles import INPUT >>> linear = MLP([Identity(), Identity()], [2, 10, 2], ... weights_init=Constant(1), biases_init=Constant(2)) >>> x = tensor.matrix('x') >>> y = linear.apply(x) >>> cg = ComputationGraph(y) We are going to drop out all the input variables >>> inputs = VariableFilter(roles=[INPUT])(cg.variables) Here we apply dropout with default setting to our computation graph >>> cg_dropout = apply_dropout(cg, inputs, 0.5) Dropped out variables have role `DROPOUT` and are tagged with `replacement_of` tag. Let's filter these variables and check if they have the links to original ones. >>> dropped_out = VariableFilter(roles=[DROPOUT])(cg_dropout.variables) >>> inputs_referenced = [var.tag.replacement_of for var in dropped_out] >>> set(inputs) == set(inputs_referenced) True Compiling theano functions to forward propagate in original and dropped out graphs >>> fprop = function(cg.inputs, cg.outputs[0]) >>> fprop_dropout = function(cg_dropout.inputs, cg_dropout.outputs[0]) Initialize an MLP and apply these functions >>> linear.initialize() >>> fprop(numpy.ones((3, 2), ... dtype=theano.config.floatX)) # doctest:+ELLIPSIS array([[ 42., 42.], [ 42., 42.], [ 42., 42.]]... >>> fprop_dropout(numpy.ones((3, 2), ... dtype=theano.config.floatX)) # doctest:+ELLIPSIS array([[ 0., 0.], [ 0., 0.], [ 0., 0.]]... And after the second run answer is different >>> fprop_dropout(numpy.ones((3, 2), ... dtype=theano.config.floatX)) # doctest:+ELLIPSIS array([[ 0., 52.], [ 100., 0.], [ 0., 0.]]... 
""" if not rng and not seed: seed = config.default_seed if not rng: rng = MRG_RandomStreams(seed) if custom_divisor is None: divisor = (1 - drop_prob) else: divisor = custom_divisor replacements = [(var, var * rng.binomial(var.shape, p=1 - drop_prob, dtype=theano.config.floatX) / divisor) for var in variables] for variable, replacement in replacements: add_role(replacement, DROPOUT) replacement.tag.replacement_of = variable return computation_graph.replace(replacements)
class SampleLayer(lasagne.layers.MergeLayer): """ Sampling layer supporting importance sampling as described in [BURDA]_ and multiple Monte Carlo samples for the approximation of E_q [log( p(x,z) / q(z|x) )]. Parameters ---------- mu : class:`Layer` instance Parameterizing the mean of the distribution to sample from as described in [BURDA]_. log_var : class:`Layer` instance By default assumed to parametrize log(sigma^2) of the distribution to sample from as described in [BURDA]_ which is transformed to sigma using the nonlinearity function as described below. Effectively this means that the nonlinearity function controls what log_var parametrizes. A few common examples: -nonlinearity = lambda x: T.exp(0.5*x) => log_var = log(sigma^2)[default] -nonlinearity = lambda x: T.sqrt(x) => log_var = sigma^2 -nonlinearity = lambda x: x => log_var = sigma eq_samples : int or T.scalar Number of Monte Carlo samples used to estimate the expectation over q(z|x) in eq. (8) in [BURDA]_. iw_samples : int or T.scalar Number of importance samples in the sum over k in eq. (8) in [BURDA]_. nonlinearity : callable or None The nonlinearity that is applied to the log_var input layer to transform it into a standard deviation. By default we assume that log_var = log(sigma^2) and hence the corresponding nonlinearity is f(x) = T.exp(0.5*x) such that T.exp(0.5*log(sigma^2)) = sigma seed : int seed to random stream Methods ---------- seed : Helper function to change the random seed after init is called References ---------- .. [BURDA] Burda, Yuri, Roger Grosse, and Ruslan Salakhutdinov. "Importance Weighted Autoencoders." arXiv preprint arXiv:1509.00519 (2015). """ def __init__(self, mean, log_var, eq_samples=1, iw_samples=1, nonlinearity=lambda x: T.exp(0.5*x), seed=lasagne.random.get_rng().randint(1, 2147462579), **kwargs): super(SampleLayer, self).__init__([mean, log_var], **kwargs) self.eq_samples = eq_samples self.iw_samples = iw_samples self.nonlinearity = nonlinearity self._srng = RandomStreams(seed) def seed(self, seed=lasagne.random.get_rng().randint(1, 2147462579)): self._srng.seed(seed) def get_output_shape_for(self, input_shapes): batch_size, num_latent = input_shapes[0] if isinstance(batch_size, int) and \ isinstance(self.iw_samples, int) and \ isinstance(self.eq_samples, int): out_dim = (batch_size*self.eq_samples*self.iw_samples, num_latent) else: out_dim = (None, num_latent) return out_dim def get_output_for(self, input, **kwargs): mu, log_var = input batch_size, num_latent = mu.shape eps = self._srng.normal( [batch_size, self.eq_samples, self.iw_samples, num_latent], dtype=theano.config.floatX) z = mu.dimshuffle(0,'x','x',1) + \ self.nonlinearity( log_var.dimshuffle(0,'x','x',1)) * eps return z.reshape((-1,num_latent))
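# --- Hedged usage sketch (layer sizes and names are illustrative): wiring the
# SampleLayer above to mean / log-variance heads of an encoder, with one MC
# sample over q(z|x) and five importance samples per data point.
import theano.tensor as T
import lasagne

x_sym = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 784), input_var=x_sym)
l_enc = lasagne.layers.DenseLayer(l_in, num_units=256,
                                  nonlinearity=lasagne.nonlinearities.rectify)
l_mu = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)
l_log_var = lasagne.layers.DenseLayer(l_enc, num_units=32, nonlinearity=None)

l_z = SampleLayer(mean=l_mu, log_var=l_log_var, eq_samples=1, iw_samples=5)

# z has shape (batch_size * eq_samples * iw_samples, num_latent)
z = lasagne.layers.get_output(l_z)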
def __init__(self, mean, log_var,
             seed=lasagne.random.get_rng().randint(1, 2147462579),
             **kwargs):
    super(SimpleSampleLayer, self).__init__([mean, log_var], **kwargs)
    self._srng = RandomStreams(seed)
    Each column corresponds to a different unit

    Returns:

    dW: a matrix of the derivatives of the expected gradient of the energy
    """

    raise NotImplementedError("TODO: implement this function.")


if __name__ == "__main__":
    m = 2
    nv = 3
    nh = 4

    h0 = T.alloc(1., m, nh)

    rng_factory = MRG_RandomStreams(42)
    W = rng_factory.normal(size=(nv, nh), dtype=h0.dtype)
    pv = T.nnet.sigmoid(T.dot(h0, W.T))
    v = rng_factory.binomial(p=pv, size=pv.shape, dtype=W.dtype)
    ph = T.nnet.sigmoid(T.dot(v, W))
    h = rng_factory.binomial(p=ph, size=ph.shape, dtype=W.dtype)

    class _ElemwiseNoGradient(theano.tensor.Elemwise):
        def grad(self, inputs, output_gradients):
            raise TypeError("You shouldn't be differentiating through "
                            "the sampling process.")
            return [theano.gradient.DisconnectedType()()]

    block_gradient = _ElemwiseNoGradient(theano.scalar.identity)

    v = block_gradient(v)
    h = block_gradient(h)
from cle.cle.train import Training from cle.cle.train.ext import (EpochCount, GradientClipping, Monitoring, Picklize, EarlyStopping, WeightNorm) from cle.cle.train.opt import Adam from cle.cle.utils import init_tparams, sharedX from cle.cle.utils.compat import OrderedDict from cle.cle.utils.op import Gaussian_sample, GMM_sample, GMM_sampleY from cle.cle.utils.gpu_op import concatenate from preprocessing.ukdale import UKdale from theano.sandbox.rng_mrg import MRG_RandomStreams from ukdale_utils import plot_lines_iamondb_example seed_rng = np.random.RandomState(np.random.randint(1024)) theano_seed = seed_rng.randint(np.iinfo(np.int32).max) default_theano_rng = MRG_RandomStreams(theano_seed) def main(args): theano.optimizer = 'fast_compile' #theano.config.exception_verbosity='high' trial = int(args['trial']) pkl_name = 'vrnn_gmm_%d' % trial channel_name = 'valid_nll_upper_bound' data_path = args['data_path'] save_path = args[ 'save_path'] #+'/aggVSdisag_distrib/'+datetime.datetime.now().strftime("%y-%m-%d_%H-%M") period = int(args['period'])
class FeedforwardNet_SVI: """ Implements a feedforward neural network trained using stochastic variational inference. Supports various types of layers and loss functions. """ def __init__(self, n_inputs): """ Constructs a net with a given number of inputs and no layers. """ assert util.math.isposint( n_inputs), 'Number of inputs must be a positive integer.' self.n_inputs = n_inputs self.n_outputs = n_inputs self.n_units = [n_inputs] self.n_layers = 0 self.n_params = 0 self.mWs = [] self.mbs = [] self.sWs = [] self.sbs = [] self.uas = [] self.mas = [] self.zas = [] self.hs = [tt.matrix('x')] self.mps = self.mWs + self.mbs self.sps = self.sWs + self.sbs self.parms = self.mps + self.sps self.input = self.hs[0] self.output = self.hs[-1] self.srng = RandomStreams() # theano functions self.eval_f = None self.eval_f_rand = None def reset_theano_functions(self): """ Resets theano functions, so that they are compiled again when needed. """ self.eval_f = None self.eval_f_rand = None def addLayer(self, n_units, type, rng=np.random): """ Adds a new layer to the network, :param n_units: number of units in the layer :param type: a string specification of the activation function """ # check number of units assert util.math.isposint( n_units), 'Number of units must be a positive integer.' # choose activation function actfun = util.ml.select_theano_act_function(type, dtype) n_prev_units = self.n_outputs self.n_outputs = n_units self.n_units.append(n_units) self.n_layers += 1 self.n_params += 2 * (n_prev_units + 1) * n_units mW = theano.shared((rng.randn(n_prev_units, n_units) / np.sqrt(n_prev_units + 1)).astype(dtype), name='mW' + str(self.n_layers), borrow=True) mb = theano.shared(np.zeros(n_units, dtype=dtype), name='mb' + str(self.n_layers), borrow=True) sW = theano.shared(-5.0 * np.ones([n_prev_units, n_units], dtype=dtype), name='sW' + str(self.n_layers), borrow=True) sb = theano.shared(-5.0 * np.ones(n_units, dtype=dtype), name='sb' + str(self.n_layers), borrow=True) ua = self.srng.normal((self.hs[-1].shape[0], n_units), dtype=dtype) ma = tt.dot(self.hs[-1], mW) + mb sa = tt.dot(self.hs[-1]**2, tt.exp(2 * sW)) + tt.exp(2 * sb) za = tt.sqrt(sa) * ua + ma h = actfun(za) h.name = 'h' + str(self.n_layers) self.mWs.append(mW) self.mbs.append(mb) self.sWs.append(sW) self.sbs.append(sb) self.uas.append(ua) self.mas.append(ma) self.zas.append(za) self.hs.append(h) self.mps = self.mWs + self.mbs self.sps = self.sWs + self.sbs self.parms = self.mps + self.sps self.output = self.hs[-1] self.reset_theano_functions() def removeLayer(self): """ Removes a layer from the network. """ assert self.n_layers > 0, 'There is no layer to remove.' n_params_to_rem = 2 * self.n_outputs * (self.n_units[-2] + 1) self.n_outputs = self.n_units[-2] self.n_units.pop() self.n_layers -= 1 self.n_params -= n_params_to_rem self.mWs.pop() self.mbs.pop() self.sWs.pop() self.sbs.pop() self.uas.pop() self.mas.pop() self.zas.pop() self.hs.pop() self.mps = self.mWs + self.mbs self.sps = self.sWs + self.sbs self.parms = self.mps + self.sps self.output = self.hs[-1] self.reset_theano_functions() def eval(self, x, rand=False): """ Evaluate net at locations in x. 
""" x = np.asarray(x, dtype=dtype) if rand: # compile theano computation graph, if haven't already done so if self.eval_f_rand is None: n_data = tt.iscalar('n_data') uas = [ tt.tile(self.srng.normal((n_units, ), dtype=dtype), [n_data, 1]) for n_units in self.n_units[1:] ] self.eval_f_rand = theano.function(inputs=[self.hs[0], n_data], outputs=self.hs[-1], givens=list( zip(self.uas, uas))) return self.eval_f_rand(x[np.newaxis, :], 1)[0] if x.ndim == 1 else self.eval_f_rand( x, x.shape[0]) else: # compile theano computation graph, if haven't already done so if self.eval_f is None: self.eval_f = theano.function(inputs=[self.hs[0]], outputs=self.hs[-1], givens=list( zip(self.zas, self.mas))) return self.eval_f( x[np.newaxis, :])[0] if x.ndim == 1 else self.eval_f(x) def printInfo(self): """ Prints some useful info about the net. """ print('Number of inputs =', self.n_inputs) print('Number of outputs =', self.n_outputs) print('Number of units =', self.n_units) print('Number of layers =', self.n_layers) print('Number of params =', self.n_params) print('Data type =', dtype) def visualize_weights(self, layer, imsize, layout): """ Displays the weights of a specified layer as images. :param layer: the layer whose weights to display :param imsize: the image size :param layout: number of rows and columns for each page :return: none """ util.plot.disp_imdata(self.mWs[layer].get_value().T, imsize, layout) plt.show(block=False) def visualize_activations(self, x, layers=None): """ Visualizes the activations of specified layers caused by a given data minibatch. :param x: a minibatch of data :param layers: list of layers to visualize activations of; defaults to the whole net except the input layer :return: none """ if layers is None: layers = range(self.n_layers) forwprop = theano.function(inputs=[self.hs[0]], outputs=self.hs[1:]) hs = forwprop(x.astype(dtype)) for l in layers: fig = plt.figure() ax = fig.add_subplot(1, 1, 1) ax.imshow(hs[l], cmap='gray', interpolation='none') ax.set_title('Layer ' + str(l)) ax.set_xlabel('layer units') ax.set_ylabel('data points') plt.show(block=False) def param_hist(self, layers=None): """ Displays a histogram of weights and biases for specified layers. :param layers: list of layers to show histograms for; defaults to the whole net :return: none """ if layers is None: layers = range(self.n_layers) for l in layers: fig, axs = plt.subplots(2, 2) nbins = int(np.sqrt(self.mWs[l].get_value().size)) axs[0, 0].hist(self.mWs[l].get_value().flatten(), nbins, normed=True) axs[0, 0].set_title('weight means, layer ' + str(l)) axs[1, 0].hist(self.sWs[l].get_value().flatten(), nbins, normed=True) axs[1, 0].set_title('weight log stds, layer ' + str(l)) nbins = int(np.sqrt(self.mbs[l].get_value().size)) axs[0, 1].hist(self.mbs[l].get_value(), nbins, normed=True) axs[0, 1].set_title('bias means, layer ' + str(l)) axs[1, 1].hist(self.sbs[l].get_value(), nbins, normed=True) axs[1, 1].set_title('bias log stds, layer ' + str(l)) plt.show(block=False)
def random_binomial(shape, p=0.0, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(1, 10e6)
    rng = RandomStreams(seed=seed)
    return rng.binomial(shape, p=p, dtype=dtype)
def sample(p, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.multinomial(n=1, pvals=p, dtype=theano.config.floatX)
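# --- Hedged usage sketch for the helper above: each row of `probs` is a
# categorical distribution, and the compiled function returns one-hot draws.
import numpy as np
import theano
import theano.tensor as T

probs = T.matrix('probs')
draw = theano.function([probs], sample(probs, seed=42))

p = np.array([[0.2, 0.5, 0.3],
              [0.9, 0.05, 0.05]], dtype=theano.config.floatX)
print(draw(p))  # e.g. [[0. 1. 0.], [1. 0. 0.]]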
def random_uniform(shape, low=0.0, high=1.0, dtype=_FLOATX, seed=None):
    if seed is None:
        seed = np.random.randint(10e6)
    rng = RandomStreams(seed=seed)
    return rng.uniform(shape, low=low, high=high, dtype=dtype)
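# --- Hedged usage sketch for the backend-style helpers above (assumes the
# same module-level RandomStreams / _FLOATX they rely on): the helpers return
# symbolic expressions, so they are compiled before drawing concrete values.
import theano

binom_draw = theano.function([], random_binomial((4, 4), p=0.5, seed=7))()
unif_draw = theano.function([], random_uniform((4, 4), low=-1.0, high=1.0, seed=7))()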
def __init__(self, incoming, sigma=0.1, **kwargs):
    super(MultiplicativeGaussianNoiseLayer, self).__init__(incoming, **kwargs)
    self._srng = RandomStreams(get_rng().randint(1, 2147462579))
    self.sigma = sigma
class VAELayer(Layer): def __init__(self, incoming, encoder, decoder, x_distribution='bernoulli', pz_distribution='gaussian', qz_distribution='gaussian', latent_size=50, W=init.Normal(0.01), b=init.Normal(0.01), **kwargs): super(VAELayer, self).__init__(incoming, **kwargs) num_batch, n_features = self.input_shape self.num_batch = num_batch self.n_features = n_features self.x_distribution = x_distribution self.pz_distribution = pz_distribution self.qz_distribution = qz_distribution self.encoder = encoder self.decoder = decoder self._srng = RandomStreams() if self.x_distribution not in ['gaussian', 'bernoulli']: raise NotImplementedError if self.pz_distribution not in ['gaussian', 'gaussianmarg']: raise NotImplementedError if self.qz_distribution not in ['gaussian', 'gaussianmarg']: raise NotImplementedError self.params_encoder = lasagne.layers.get_all_params(encoder) self.params_decoder = lasagne.layers.get_all_params(decoder) for p in self.params_encoder: p.name = "VAELayer encoder :" + p.name for p in self.params_decoder: p.name = "VAELayer decoder :" + p.name self.num_hid_enc = encoder.output_shape[1] self.num_hid_dec = decoder.output_shape[1] self.latent_size = latent_size self.W_enc_to_z_mu = self.add_param(W, (self.num_hid_enc, latent_size)) self.b_enc_to_z_mu = self.add_param(b, (latent_size, )) self.W_enc_to_z_logsigma = self.add_param( W, (self.num_hid_enc, self.latent_size)) self.b_enc_to_z_logsigma = self.add_param(b, (latent_size, )) self.W_dec_to_x_mu = self.add_param( W, (self.num_hid_dec, self.n_features)) self.b_dec_to_x_mu = self.add_param(b, (self.n_features, )) self.W_params = [ self.W_enc_to_z_mu, self.W_enc_to_z_logsigma, self.W_dec_to_x_mu ] + self.params_encoder + self.params_decoder self.bias_params = [ self.b_enc_to_z_mu, self.b_enc_to_z_logsigma, self.b_dec_to_x_mu ] params_tmp = [] if self.x_distribution == 'gaussian': self.W_dec_to_x_logsigma = self.add_param( W, (self.num_hid_dec, self.n_features)) self.b_dec_to_x_logsigma = self.add_param(b, (self.n_features, )) self.W_params += [self.W_dec_to_x_logsigma] self.bias_params += [self.b_dec_to_x_logsigma] self.W_dec_to_x_logsigma.name = "VAE: W_dec_to_x_logsigma" self.b_dec_to_x_logsigma.name = "VAE: b_dec_to_x_logsigma" params_tmp = [self.W_dec_to_x_logsigma, self.b_dec_to_x_logsigma] self.params = self.params_encoder + [self.W_enc_to_z_mu, self.b_enc_to_z_mu, self.W_enc_to_z_logsigma, self.b_enc_to_z_logsigma] + self.params_decoder + \ [self.W_dec_to_x_mu, self.b_dec_to_x_mu] + params_tmp self.W_enc_to_z_mu.name = "VAELayer: W_enc_to_z_mu" self.W_enc_to_z_logsigma.name = "VAELayer: W_enc_to_z_logsigma" self.W_dec_to_x_mu.name = "VAELayer: W_dec_to_x_mu" self.b_enc_to_z_mu.name = "VAELayer: b_enc_to_z_mu" self.b_enc_to_z_logsigma.name = "VAELayer: b_enc_to_z_logsigma" self.b_dec_to_x_mu.name = "VAELayer: b_dec_to_x_mu" def get_params(self): return self.params def get_output_shape_for(self, input_shape): dec_out_shp = self.decoder.get_output_shape_for( (self.num_batch, self.num_hid_dec)) if self.x_distribution == 'bernoulli': return dec_out_shp elif self.x_distribution == 'gaussian': return [dec_out_shp, dec_out_shp] def _encoder_output(self, x, *args, **kwargs): return lasagne.layers.get_output(self.encoder, x, **kwargs) def decoder_output(self, z, *args, **kwargs): h_decoder = lasagne.layers.get_output(self.decoder, z, **kwargs) if self.x_distribution == 'gaussian': mu_decoder = T.dot(h_decoder, self.W_dec_to_x_mu) + self.b_dec_to_x_mu log_sigma_decoder = T.dot( h_decoder, self.W_dec_to_x_logsigma) + 
self.b_dec_to_x_logsigma decoder_out = mu_decoder, log_sigma_decoder elif self.x_distribution == 'bernoulli': # TODO: Finish writing the output of the decoder for a bernoulli distributed x. decoder_out = T.nnet.sigmoid( T.dot(h_decoder, self.W_dec_to_x_mu) + self.b_dec_to_x_mu) else: raise NotImplementedError return decoder_out def get_z_mu_sigma(self, x, *args, **kwargs): h_encoder = self._encoder_output(x, *args, **kwargs) mu_encoder = T.dot(h_encoder, self.W_enc_to_z_mu) + self.b_enc_to_z_mu log_sigma_encoder = (T.dot(h_encoder, self.W_enc_to_z_logsigma) + self.b_enc_to_z_logsigma) eps = self._srng.normal(log_sigma_encoder.shape) # TODO: Calculate the sampled z. z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps return z, mu_encoder, log_sigma_encoder def get_log_distributions(self, x, *args, **kwargs): # sample z from q(z|x). h_encoder = self._encoder_output(x, *args, **kwargs) mu_encoder = T.dot(h_encoder, self.W_enc_to_z_mu) + self.b_enc_to_z_mu log_sigma_encoder = (T.dot(h_encoder, self.W_enc_to_z_logsigma) + self.b_enc_to_z_logsigma) eps = self._srng.normal(log_sigma_encoder.shape) z = mu_encoder + T.exp(0.5 * log_sigma_encoder) * eps # forward pass z through decoder to generate p(x|z). decoder_out = self.decoder_output(z, *args, **kwargs) if self.x_distribution == 'bernoulli': x_mu = decoder_out log_px_given_z = -T.nnet.binary_crossentropy(x_mu, x) elif self.x_distribution == 'gaussian': x_mu, x_logsigma = decoder_out log_px_given_z = normal2(x, x_mu, x_logsigma) # sample prior distribution p(z). if self.pz_distribution == 'gaussian': log_pz = standard_normal(z) elif self.pz_distribution == 'gaussianmarg': log_pz = -0.5 * (T.log(2 * np.pi) + (T.sqr(mu_encoder) + T.exp(log_sigma_encoder))) # variational approximation distribution q(z|x) if self.qz_distribution == 'gaussian': log_qz_given_x = normal2(z, mu_encoder, log_sigma_encoder) elif self.qz_distribution == 'gaussianmarg': log_qz_given_x = -0.5 * (T.log(2 * np.pi) + 1 + log_sigma_encoder) # sum over dim 1 to get shape (,batch_size) log_px_given_z = log_px_given_z.sum( axis=1, dtype=theano.config.floatX) # sum over x log_pz = log_pz.sum(axis=1, dtype=theano.config.floatX) # sum over latent vars log_qz_given_x = log_qz_given_x.sum( axis=1, dtype=theano.config.floatX) # sum over latent vars return log_pz, log_qz_given_x, log_px_given_z def draw_sample(self, z=None, *args, **kwargs): if z is None: # draw random z z = self._srng.normal((self.num_batch, self.latent_size)) return self.decoder_output(z, *args, **kwargs)
def __init__(self, p):
    super(Dropout, self).__init__()
    self.p = p
    self.srng = RandomStreams(seed=np.random.randint(10e6))
LATENT_DIM = args.latent_dim
ALPHA_ITERS = args.alpha_iters
VANILLA = False
LR = 1e-3

BATCH_SIZE = 100
N_CHANNELS = 1
HEIGHT = 28
WIDTH = 28

TEST_BATCH_SIZE = 100
TIMES = ('iters', 500, 500 * 400, 500, 400 * 500, 2 * ALPHA_ITERS)

lib.print_model_settings(locals().copy())

theano_srng = RandomStreams(seed=234)
np.random.seed(123)


def PixCNNGate(x):
    a = x[:, ::2]
    b = x[:, 1::2]
    return T.tanh(a) * T.nnet.sigmoid(b)


def PixCNN_condGate(x, z, dim, activation='tanh', name=""):
    a = x[:, ::2]
    b = x[:, 1::2]

    Z_to_tanh = lib.ops.linear.Linear(name + ".tanh",
class RandomizedRectifierLayer(Layer): """ A layer that applies a randomized leaky rectify nonlinearity to its input. The randomized leaky rectifier was first proposed and used in the Kaggle NDSB Competition, and later evaluated in [1]_. Compared to the standard leaky rectifier :func:`leaky_rectify`, it has a randomly sampled slope for negative input during training, and a fixed slope during evaluation. Equation for the randomized rectifier linear unit during training: :math:`\\varphi(x) = \\max((\\sim U(lower, upper)) \\cdot x, x)` During evaluation, the factor is fixed to the arithmetic mean of `lower` and `upper`. Parameters ---------- incoming : a :class:`Layer` instance or a tuple The layer feeding into this layer, or the expected input shape lower : Theano shared variable, expression, or constant The lower bound for the randomly chosen slopes. upper : Theano shared variable, expression, or constant The upper bound for the randomly chosen slopes. shared_axes : 'auto', 'all', int or tuple of int The axes along which the random slopes of the rectifier units are going to be shared. If ``'auto'`` (the default), share over all axes except for the second - this will share the random slope over the minibatch dimension for dense layers, and additionally over all spatial dimensions for convolutional layers. If ``'all'``, share over all axes, thus using a single random slope. **kwargs Any additional keyword arguments are passed to the `Layer` superclass. References ---------- .. [1] Bing Xu, Naiyan Wang et al. (2015): Empirical Evaluation of Rectified Activations in Convolutional Network, http://arxiv.org/abs/1505.00853 """ def __init__(self, incoming, lower=0.3, upper=0.8, shared_axes='auto', **kwargs): super(RandomizedRectifierLayer, self).__init__(incoming, **kwargs) self._srng = RandomStreams(get_rng().randint(1, 2147462579)) self.lower = lower self.upper = upper if not isinstance(lower > upper, theano.Variable) and lower > upper: raise ValueError("Upper bound for RandomizedRectifierLayer needs " "to be higher than lower bound.") if shared_axes == 'auto': self.shared_axes = (0,) + tuple(range(2, len(self.input_shape))) elif shared_axes == 'all': self.shared_axes = tuple(range(len(self.input_shape))) elif isinstance(shared_axes, int): self.shared_axes = (shared_axes,) else: self.shared_axes = shared_axes def get_output_for(self, input, deterministic=False, **kwargs): """ Parameters ---------- input : tensor output from the previous layer deterministic : bool If true, the arithmetic mean of lower and upper are used for the leaky slope. """ if deterministic or self.upper == self.lower: return theano.tensor.nnet.relu(input, (self.upper+self.lower)/2.0) else: shape = list(self.input_shape) if any(s is None for s in shape): shape = list(input.shape) for ax in self.shared_axes: shape[ax] = 1 rnd = self._srng.uniform(tuple(shape), low=self.lower, high=self.upper, dtype=theano.config.floatX) rnd = theano.tensor.addbroadcast(rnd, *self.shared_axes) return theano.tensor.nnet.relu(input, rnd)
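# --- Hedged usage sketch (shapes are illustrative): an RReLU after a linear
# dense layer. At training time each unit gets a negative slope drawn from
# U(lower, upper), shared over the minibatch; with deterministic=True the
# fixed mean slope (lower + upper) / 2 is used instead.
import theano.tensor as T
import lasagne

x_sym = T.matrix('x')
l_in = lasagne.layers.InputLayer((None, 100), input_var=x_sym)
l_dense = lasagne.layers.DenseLayer(l_in, num_units=200,
                                    nonlinearity=lasagne.nonlinearities.identity)
l_rrelu = RandomizedRectifierLayer(l_dense, lower=0.3, upper=0.8)

train_out = lasagne.layers.get_output(l_rrelu)                     # random slopes
test_out = lasagne.layers.get_output(l_rrelu, deterministic=True)  # mean slope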
class DropoutLayer(BaseLayer): """ This class implements dropout for layer output energy (activations). """ def __init__(self, drop_probability=0.5, rescale=False, seed=4455): """ This function initializes the class. Input and output tensor shape is equal. Parameters ---------- drop_probability: float, default: 0.5 a float value ratio of how many activations will be zero. rescale: bool, default: True a bool value whether we rescale the output or not. multiply ratio and preserve the variance. seed: int an integer for random seed. """ super(DropoutLayer, self).__init__() # check asserts assert drop_probability >= 0 and drop_probability < 1, '"drop_probability" should be in range [0, 1).' assert isinstance(rescale, bool), '"rescale" should be a bool value whether we use dropout rescaling or not.' # set members self.drop_probability = drop_probability self.rescale = rescale self.rng = MRG(seed) # random number generator def set_shared(self): """ This function overrides the parents' one. Set shared Variables. Shared Variables ---------------- flag: scalar a scalar value to distinguish training mode and inference mode. """ self.flag = theano.shared(1, self.name + '_flag') # 1: train / -1: inference self.flag.tags = ['flag', self.name] def change_flag(self, new_flag): """ This function change flag to change training and inference mode. If flag > 0, training mode, else, inference mode. Parameters --------- new_flag: int (or float) a single scalar value to be a new flag. """ self.flag.set_value(float(new_flag)) # 1: train / -1: inference def get_output(self, input_): """ This function overrides the parents' one. Creates symbolic function to compute output from an input. The symbolic function use theano switch function conditioned by flag. Math Expression --------------- For inference: y = x For training: mask ~ U[0, 1] and sampled to binomial. y = 1 / ( 1 - drop_probability) * x * mask Parameters ---------- input_: TensorVariable Returns ------- Tensorvariable """ if self.rescale is True: coeff = 1 / (1 - self.drop_probability) else: coeff = 1 mask = self.rng.binomial(input_.shape, p=1 - self.drop_probability, dtype=input_.dtype) return T.switch(T.gt(self.flag, 0), input_ * mask * coeff, input_)
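# --- Hedged usage sketch (assumes the surrounding BaseLayer plumbing sets
# self.name and that MRG is the usual MRG_RandomStreams alias): the shared
# flag switches the same compiled graph between training and inference mode.
import numpy as np
import theano
import theano.tensor as T

x_sym = T.matrix('x')
drop = DropoutLayer(drop_probability=0.5, rescale=True)
drop.set_shared()                 # creates the train / inference flag
f = theano.function([x_sym], drop.get_output(x_sym))

x = np.ones((2, 4), dtype=theano.config.floatX)
drop.change_flag(1)               # training: ~half the units zeroed, rest scaled by 2
print(f(x))
drop.change_flag(-1)              # inference: output equals the input
print(f(x))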
def __init__(self, sources, n_out, index, y_in=None, target=None, target_index=None, sparse=False, cost_scale=1.0, input_scale=1.0, L1=0.0, L2=0.0, L2_eye=None, varreg=0.0, output_L2_reg=0.0, output_entropy_reg=0.0, output_entropy_exp_reg=0.0, with_bias=True, mask="unity", dropout=0.0, batch_drop=False, batch_norm=False, bn_use_sample=False, layer_drop=0.0, residual=False, carry=False, sparse_filtering=False, gradient_scale=1.0, trainable=True, device=None, dtype='float32', **kwargs): """ :param list[NetworkBaseLayer.Layer] sources: list of source layers :param int n_out: output dim of W_in and dim of bias :param float L1: l1-param-norm regularization :param float L2: l2-param-norm regularization :param str mask: "unity" or "dropout" :type dropout: float """ super(Layer, self).__init__(**kwargs) self.index = index self.sources = sources ":type: list[Layer]" self.num_sources = len(sources) self.D = max([s.D for s in sources if isinstance(s, Layer)] + [0]) if mask is None: mask = 'none' self.set_attr('mask', mask) self.set_attr('dropout', dropout) self.set_attr('sparse', sparse) self.set_attr('bn_use_sample', bn_use_sample) self.set_attr('sparse_filtering', sparse_filtering) if not trainable: self.set_attr('trainable', trainable) # only store if not default self.gradient_scale = 0.0 # just to be sure else: self.gradient_scale = gradient_scale if gradient_scale != 1.0: self.set_attr('gradient_scale', gradient_scale) self.set_attr('layer_drop', layer_drop) assert not carry, "not supported anymore" self.set_attr('residual', residual) self.set_attr('n_out', n_out) self.set_attr('L1', L1) self.set_attr('L2', L2) if L2_eye: self.set_attr('L2_eye', L2_eye) self.device = device # if device else str(theano.config.device) for s in self.sources: s.transfer_output(self.device) self.set_attr('varreg', varreg) if output_L2_reg: self.set_attr('output_L2_reg', output_L2_reg) if output_entropy_reg: self.set_attr('output_entropy_reg', output_entropy_reg) if output_entropy_exp_reg: self.set_attr('output_entropy_exp_reg', output_entropy_exp_reg) self.set_attr('batch_norm', batch_norm) self.set_attr('input_scale', input_scale) if y_in is not None: self.y_in = {} for k in y_in: if not isinstance(y_in[k], T.Variable): continue self.y_in[k] = time_batch_make_flat( y_in[k]) # TODO: better not flatten here... self.y_in[k].n_out = getattr(y_in[k], "n_out", None) else: self.y_in = None self.constraints = T.constant(0) if target: self.set_attr('target', target) if target_index: self.set_attr('target_index', target_index) assert target_index in self.network.j self.index = index = self.network.j[target_index] if cost_scale != 1: self.set_attr("cost_scale", cost_scale) if with_bias: self.b = self.add_param(self.create_bias(n_out), 'b_%s' % self.name) else: self.set_attr('with_bias', False) self.b = numpy.float32(0) self.mass = T.constant(1., name="mass_%s" % self.name, dtype='float32') self.masks = [None] * len(self.sources) assert mask in ['dropout', 'unity', 'none'], "invalid mask: %s" % mask if mask == "dropout" or (mask == 'none' and dropout > 0): assert 0.0 < dropout < 1.0 # If we apply this mass during training then we don't need any mask or mass for testing. # The expected weight should be 1 in # E[x] = mass * (1-dropout) # so mass has to be 1 / (1 - dropout). 
            self.mass = T.constant(1.0 / (1.0 - dropout), dtype='float32')
            from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
            srng = RandomStreams(self.rng.randint(1234) + 1)
            if self.depth > 1:
                self.masks = [
                    T.cast(
                        srng.binomial(n=1, p=1 - dropout,
                                      size=(s.attrs['n_out'], self.depth)),
                        theano.config.floatX) for s in self.sources
                ]
            else:
                if batch_drop:
                    self.masks = [
                        T.cast(
                            srng.binomial(n=1, p=1 - dropout,
                                          size=s.output.shape),
                            theano.config.floatX) for s in self.sources
                    ]
                else:
                    self.masks = [
                        T.cast(
                            srng.binomial(n=1, p=1 - dropout,
                                          size=(s.attrs['n_out'],)),
                            theano.config.floatX) for s in self.sources
                    ]
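# --- Hedged numeric check of the comment above (NumPy only): with inverted
# dropout the kept units are scaled by mass = 1 / (1 - dropout), so the
# expected value of (mask * mass) is 1 and no rescaling is needed at test time.
import numpy as np

dropout = 0.3
mass = 1.0 / (1.0 - dropout)
mask = np.random.binomial(n=1, p=1 - dropout, size=1000000)
print((mask * mass).mean())  # ~ 1.0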