    def __init__(self, decay=0.9, max_scaling=1e5):
        assert 0. <= decay < 1.
        assert max_scaling > 0
        self.decay = sharedX(decay, 'decay')
        self.epsilon = 1. / max_scaling
        self.mean_square_grads = OrderedDict()
    def get_updates(self, learning_rate, grads, lr_scalers=None):
        """
        Provides the symbolic (theano) description of the updates needed to
        perform this learning rule. See Notes for side-effects.

        Parameters
        ----------
        learning_rate : float
            Learning rate coefficient.
        grads : dict
            A dictionary mapping from the model's parameters to their
            gradients.
        lr_scalers : dict
            A dictionary mapping from the model's parameters to a learning
            rate multiplier.

        Returns
        -------
        updates : OrderedDict
            A dictionary mapping from the old model parameters to their new
            values after a single iteration of the learning rule.

        Notes
        -----
        This method has the side effect of storing the moving average of the
        square gradient in `self.mean_square_grads`. This is necessary in
        order for the monitoring channels to be able to track the value of
        these moving averages. Therefore, this method should only get called
        once for each instance of RMSProp.
        """
        updates = OrderedDict()
        for param in grads:

            # mean_squared_grad := E[g^2]_{t-1}
            mean_square_grad = sharedX(param.get_value() * 0.)

            if param.name is None:
                raise ValueError("Model parameters must be named.")
            mean_square_grad.name = 'mean_square_grad_' + param.name

            if param.name in self.mean_square_grads:
                warnings.warn("Calling get_updates more than once on the "
                              "gradients of `%s` may make monitored values "
                              "incorrect." % param.name)
            # Store variable in self.mean_square_grads for monitoring.
            self.mean_square_grads[param.name] = mean_square_grad

            # Accumulate gradient
            new_mean_squared_grad = (self.decay * mean_square_grad +
                                     (1 - self.decay) * T.sqr(grads[param]))

            # Compute update
            scaled_lr = lr_scalers.get(param, 1.) * learning_rate
            rms_grad_t = T.sqrt(new_mean_squared_grad)
            rms_grad_t = T.maximum(rms_grad_t, self.epsilon)
            delta_x_t = -scaled_lr * grads[param] / rms_grad_t

            # Apply update
            updates[mean_square_grad] = new_mean_squared_grad
            updates[param] = param + delta_x_t

        return updates
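
# Illustrative sketch only (hypothetical helper, not part of pylearn2): it
# mirrors the arithmetic of RMSProp.get_updates above in plain NumPy, for a
# single parameter array and its gradient.
def _rmsprop_step_example(param, grad, mean_square, learning_rate=0.01,
                          decay=0.9, max_scaling=1e5):
    """Return (new_param, new_mean_square) after one RMSProp step."""
    import numpy as np
    epsilon = 1. / max_scaling
    # E[g^2]_t = decay * E[g^2]_{t-1} + (1 - decay) * g_t^2
    new_mean_square = decay * mean_square + (1. - decay) * grad ** 2
    # Scale the step by the root-mean-square gradient, clipped from below.
    rms = np.maximum(np.sqrt(new_mean_square), epsilon)
    new_param = param - learning_rate * grad / rms
    return new_param, new_mean_square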
def setup_detector_layer_c01b(layer, input_space, rng):
    """
    .. todo::

        WRITEME properly

    Takes steps to set up an object for use as being some kind of
    convolutional layer. This function sets up only the detector layer.

    Does the following:

    * raises a RuntimeError if cuda is not available
    * sets layer.input_space to input_space
    * sets up addition of dummy channels for compatibility with cuda-convnet:

      - layer.dummy_channels: # of dummy channels that need to be added
        (You might want to check this and raise an Exception if it's not 0)
      - layer.dummy_space: The Conv2DSpace representing the input with dummy
        channels added

    * sets layer.detector_space to the space for the detector layer
    * sets layer.transformer to be a Conv2D instance
    * sets layer.b to the right value

    Parameters
    ----------
    layer : object
        Any python object that allows the modifications described below and
        has the following attributes:

        * pad : int describing amount of zero padding to add
        * kernel_shape : 2-element tuple or list describing spatial shape of
          kernel
        * fix_kernel_shape : bool, if true, will shrink the kernel shape to
          make it feasible, as needed (useful for hyperparameter searchers)
        * detector_channels : The number of channels in the detector layer
        * init_bias : numeric constant added to a tensor of zeros to
          initialize the bias
        * tied_b : If true, biases are shared across all spatial locations
    input_space : Conv2DSpace
        A Conv2DSpace to be used as input to the layer
    rng : numpy.random.RandomState
        A numpy RandomState or equivalent
    """

    # Use "self" to refer to layer from now on, so we can pretend we're
    # just running in the set_input_space method of the layer
    self = layer

    # Make sure cuda is available
    check_cuda(str(type(self)))

    # Validate input
    if not isinstance(input_space, Conv2DSpace):
        raise TypeError("The input to a convolutional layer should be a "
                        "Conv2DSpace, but layer " + self.layer_name +
                        " got " + str(type(self.input_space)))

    if not hasattr(self, 'detector_channels'):
        raise ValueError("layer argument must have a 'detector_channels' "
                         "attribute specifying how many channels to put in "
                         "the convolution kernel stack.")

    # Store the input space
    self.input_space = input_space

    # Make sure number of channels is supported by cuda-convnet
    # (multiple of 4 or <= 3)
    # If not supported, pad the input with dummy channels
    ch = self.input_space.num_channels
    rem = ch % 4
    if ch > 3 and rem != 0:
        self.dummy_channels = 4 - rem
    else:
        self.dummy_channels = 0
    self.dummy_space = Conv2DSpace(
        shape=input_space.shape,
        channels=input_space.num_channels + self.dummy_channels,
        axes=('c', 0, 1, 'b')
    )

    if hasattr(self, 'kernel_stride'):
        kernel_stride = self.kernel_stride
    else:
        kernel_stride = [1, 1]

    output_shape = \
        [int(np.ceil((i_sh + 2. * self.pad - k_sh) / float(k_st))) + 1
         for i_sh, k_sh, k_st in zip(self.input_space.shape,
                                     self.kernel_shape,
                                     kernel_stride)]

    def handle_kernel_shape(idx):
        if self.kernel_shape[idx] < 1:
            raise ValueError("kernel must have strictly positive size on all "
                             "axes but has shape: " + str(self.kernel_shape))
        if output_shape[idx] <= 0:
            if self.fix_kernel_shape:
                self.kernel_shape[idx] = \
                    self.input_space.shape[idx] + 2 * self.pad
                assert self.kernel_shape[idx] != 0
                output_shape[idx] = 1
                warnings.warn("Had to change the kernel shape to make "
                              "network feasible")
            else:
                raise ValueError("kernel too big for input "
                                 "(even with zero padding)")

    map(handle_kernel_shape, [0, 1])

    if self.detector_channels < 16:
        raise ValueError("Cuda-convnet requires the detector layer to have "
                         "at least 16 channels.")

    self.detector_space = Conv2DSpace(shape=output_shape,
                                      num_channels=self.detector_channels,
                                      axes=('c', 0, 1, 'b'))

    if hasattr(self, 'partial_sum'):
        partial_sum = self.partial_sum
    else:
        partial_sum = 1

    if hasattr(self, 'sparse_init') and self.sparse_init is not None:
        self.transformer = \
            checked_call(make_sparse_random_conv2D,
                         OrderedDict([('num_nonzero', self.sparse_init),
                                      ('input_space', self.input_space),
                                      ('output_space', self.detector_space),
                                      ('kernel_shape', self.kernel_shape),
                                      ('pad', self.pad),
                                      ('partial_sum', partial_sum),
                                      ('kernel_stride', kernel_stride),
                                      ('rng', rng)]))
    else:
        self.transformer = make_random_conv2D(
            irange=self.irange,
            input_axes=self.input_space.axes,
            output_axes=self.detector_space.axes,
            input_channels=self.dummy_space.num_channels,
            output_channels=self.detector_space.num_channels,
            kernel_shape=self.kernel_shape,
            pad=self.pad,
            partial_sum=partial_sum,
            kernel_stride=kernel_stride,
            rng=rng
        )

    W, = self.transformer.get_params()
    W.name = self.layer_name + '_W'

    if self.tied_b:
        self.b = sharedX(np.zeros(self.detector_space.num_channels) +
                         self.init_bias)
    else:
        self.b = sharedX(self.detector_space.get_origin() + self.init_bias)
    self.b.name = self.layer_name + '_b'

    logger.info('Input shape: {0}'.format(self.input_space.shape))
    logger.info('Detector space: {0}'.format(self.detector_space.shape))
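
# Illustrative sketch only (hypothetical helper, not part of pylearn2): the
# detector-space shape computed in setup_detector_layer_c01b follows
#     out = ceil((in + 2 * pad - kernel) / stride) + 1
# on each spatial axis. For example, a 32x32 input with a 5x5 kernel, pad=0
# and stride 1 gives a 28x28 detector space.
def _c01b_detector_shape_example(input_shape, kernel_shape, pad=0,
                                 kernel_stride=(1, 1)):
    """Return the spatial shape of the detector layer for these settings."""
    import numpy as np
    return [int(np.ceil((i_sh + 2. * pad - k_sh) / float(k_st))) + 1
            for i_sh, k_sh, k_st in zip(input_shape, kernel_shape,
                                        kernel_stride)]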
def estimate_likelihood(W_list, b_list, trainset, testset, free_energy_fn=None,
                        batch_size=100, large_ais=False, log_z=None,
                        pos_mf_steps=50, pos_sample_steps=0):
    """
    Compute estimates of the log-partition function and the likelihood of
    trainset and testset.

    Parameters
    ----------
    W_list : array-like object of theano shared variables
        Weights of the DBM
    b_list : array-like object of theano shared variables
        Biases of the DBM
    trainset : pylearn2.datasets.dataset.Dataset
        Training set
    testset : pylearn2.datasets.dataset.Dataset
        Test set
    free_energy_fn : theano.function
        Function which, given temperature beta_k, computes the free energy
        of the samples stored in model.samples. This function should return
        a symbolic vector.
    batch_size : integer
        Size of a batch of examples
    large_ais : boolean
        If True, will use 3e5 chains, instead of 3e4
    log_z : scalar, optional
        Log-partition function (if precomputed)
    pos_mf_steps : integer
        Number of fixed-point iterations for approximate inference
    pos_sample_steps : integer
        Same role as pos_mf_steps; when both pos_mf_steps > 0 and
        pos_sample_steps > 0, pos_mf_steps has priority

    Returns
    -------
    nll : scalar
        Negative log-likelihood of data.X under `model`.
    logz : scalar
        Estimate of log-partition function of `model`.
    """
    warnings.warn("This is guaranteed to work only for DBMs with a "
                  "BinaryVector visible layer and BinaryVectorMaxPool "
                  "hidden layers with pool sizes of 1.")

    # Add a dummy placeholder for visible layer's weights in W_list
    W_list = [None] + W_list

    # Depth of the DBM
    depth = len(b_list)

    # Initialize samples
    psamples = []
    nsamples = []
    for i, b in enumerate(b_list):
        psamples += [utils.sharedX(rng.rand(batch_size,
                                            b.get_value().shape[0]),
                                   name='psamples%i' % i)]
        nsamples += [utils.sharedX(rng.rand(batch_size,
                                            b.get_value().shape[0]),
                                   name='nsamples%i' % i)]
    psamples[0] = T.matrix('psamples0')

    ##########################
    ## BUILD THEANO FUNCTIONS
    ##########################
    beta = T.scalar()

    # For an even number of layers, we marginalize the odd layers
    # (and vice-versa)
    marginalize_odd = (depth % 2) == 0

    # Build function to retrieve energy.
    E = -T.dot(nsamples[0], b_list[0]) * beta
    for i in xrange(1, depth):
        E -= T.sum(T.dot(nsamples[i - 1], W_list[i] * beta) * nsamples[i],
                   axis=1)
        E -= T.dot(nsamples[i], b_list[i] * beta)
    energy_fn = theano.function([beta], E)

    # Build inference function.
    assert (pos_mf_steps or pos_sample_steps)
    pos_steps = pos_mf_steps if pos_mf_steps else pos_sample_steps
    new_psamples = _e_step(psamples, W_list, b_list, n_steps=pos_steps)
    ups = OrderedDict()
    for psample, new_psample in zip(psamples[1:], new_psamples[1:]):
        ups[psample] = new_psample
    temp = numpy.asarray(trainset.X, dtype=floatX)
    mean_train = numpy.mean(temp, axis=0)
    inference_fn = theano.function(inputs=[psamples[0]], outputs=[],
                                   updates=ups)

    # Configure baserate bias for (h0 if `marginalize_odd` else h1)
    inference_fn(numpy.tile(mean_train, (batch_size, 1)))
    numpy_psamples = [mean_train[None, :]] + \
        [psample.get_value() for psample in psamples[1:]]
    mean_pos = numpy.minimum(numpy_psamples[not marginalize_odd], 1 - 1e-5)
    mean_pos = numpy.maximum(mean_pos, 1e-5)
    pa_bias = -numpy.log(1. / mean_pos[0] - 1.)

    # Build Theano function to sample from interpolating distributions.
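    # Each beta_k defines an intermediate distribution interpolating between
    # the factorial base-rate model parameterized by `pa_bias` (beta = 0) and
    # the target DBM (beta = 1); the energy, sampling, and free-energy
    # functions built here all take beta as input so the AIS chains can be
    # annealed along the `betas` schedule defined below.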
    updates = OrderedDict()
    new_nsamples = neg_sampling(W_list, b_list, nsamples, beta=beta,
                                pa_bias=pa_bias,
                                marginalize_odd=marginalize_odd,
                                theano_rng=theano_rng)
    for (nsample, new_nsample) in zip(nsamples, new_nsamples):
        updates[nsample] = new_nsample
    sample_fn = theano.function([beta], [], updates=updates,
                                name='sample_func')

    # Build function to compute free-energy of p_k(h1).
    fe_bp_h1 = free_energy_at_beta(W_list, b_list, nsamples, beta,
                                   pa_bias, marginalize_odd=marginalize_odd)
    free_energy_fn = theano.function([beta], fe_bp_h1)

    ###########
    ## RUN AIS
    ###########

    # Generate exact sample for the base model.
    for i, nsample_i in enumerate(nsamples):
        bias = pa_bias if i == 1 else b_list[i].get_value()
        hi_mean_vec = 1. / (1. + numpy.exp(-bias))
        hi_mean = numpy.tile(hi_mean_vec, (batch_size, 1))
        r = rng.random_sample(hi_mean.shape)
        hi_sample = numpy.array(hi_mean > r, dtype=floatX)
        nsample_i.set_value(hi_sample)

    # Default configuration for interpolating distributions
    if large_ais:
        betas = numpy.cast[floatX](numpy.hstack(
            (numpy.linspace(0, 0.5, 1e5 + 1)[:-1],
             numpy.linspace(0.5, 0.9, 1e5 + 1)[:-1],
             numpy.linspace(0.9, 1.0, 1e5))))
    else:
        betas = numpy.cast[floatX](numpy.hstack(
            (numpy.linspace(0, 0.5, 1e4 + 1)[:-1],
             numpy.linspace(0.5, 0.9, 1e4 + 1)[:-1],
             numpy.linspace(0.9, 1.0, 1e4))))

    if log_z is None:
        log_ais_w = compute_log_ais_weights(batch_size, free_energy_fn,
                                            sample_fn, betas)
        dlogz, var_dlogz = estimate_from_weights(log_ais_w)
        log_za = compute_log_za(b_list, pa_bias, marginalize_odd)
        log_z = log_za + dlogz
        logging.info('log_z = %f' % log_z)
        logging.info('log_za = %f' % log_za)
        logging.info('dlogz = %f' % dlogz)
        logging.info('var_dlogz = %f' % var_dlogz)

    train_ll = compute_likelihood_given_logz(nsamples, psamples, batch_size,
                                             energy_fn, inference_fn,
                                             log_z, trainset.X)
    logging.info('Training likelihood = %f' % train_ll)

    test_ll = compute_likelihood_given_logz(nsamples, psamples, batch_size,
                                            energy_fn, inference_fn,
                                            log_z, testset.X)
    logging.info('Test likelihood = %f' % test_ll)

    return (train_ll, test_ll, log_z)
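
# Illustrative sketch only (hypothetical helper, not part of pylearn2): the
# base-model sampling loop in estimate_likelihood draws each layer exactly
# from a factorial Bernoulli distribution whose mean is the sigmoid of its
# bias, as this plain-NumPy version shows.
def _base_rate_sample_example(bias, batch_size, rng):
    """Return a (batch_size, len(bias)) Bernoulli sample with mean sigmoid(bias)."""
    import numpy as np
    mean = 1. / (1. + np.exp(-bias))         # sigmoid of the bias vector
    mean = np.tile(mean, (batch_size, 1))    # one row per AIS chain
    return np.array(mean > rng.random_sample(mean.shape), dtype='float32')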