class RBM(Block, Model):
    """
    A base interface for RBMs, implementing the binary-binary case.
    """

    def __init__(self, nvis=None, nhid=None, vis_space=None, hid_space=None,
                 transformer=None, irange=0.5, rng=None, init_bias_vis=None,
                 init_bias_vis_marginals=None, init_bias_hid=0.0,
                 base_lr=1e-3, anneal_start=None, nchains=100,
                 sml_gibbs_steps=1, random_patches_src=None,
                 monitor_reconstruction=False):
        """
        Construct an RBM object.

        Parameters
        ----------
        nvis : int
            Number of visible units in the model. (Specifying this implies
            that the model acts on a vector, i.e. it sets
            vis_space = pylearn2.space.VectorSpace(nvis).)
        nhid : int
            Number of hidden units in the model. (Specifying this implies
            that the model acts on a vector.)
        vis_space : pylearn2.space.Space
            A Space object describing what kind of vector space the RBM
            acts on. Do not specify if you used nvis / nhid.
        hid_space : pylearn2.space.Space
            A Space object describing what kind of vector space the RBM's
            hidden units live in. Do not specify if you used nvis / nhid.
        init_bias_vis_marginals : Dataset or None
            Either None, or a Dataset to use to initialize the visible
            biases to the inverse sigmoid of the data marginals.
        irange : float, optional
            The size of the initial interval around 0 for weights.
        rng : RandomState object or seed
            NumPy RandomState object to use when initializing parameters
            of the model, or (integer) seed to use to create one.
        init_bias_vis : array_like, optional
            Initial value of the visible biases, broadcasted as necessary.
        init_bias_hid : array_like, optional
            Initial value of the hidden biases, broadcasted as necessary.
        monitor_reconstruction : bool
            If True, will request a monitoring channel to monitor
            reconstruction error.
        random_patches_src : Dataset or None
            Either None, or a Dataset from which to draw random patches in
            order to initialize the weights. Patches will be multiplied by
            irange.

        Parameters for the default SML learning rule:

        base_lr : float
            The base learning rate.
        anneal_start : int
            Number of steps after which to start annealing on a 1/t
            schedule.
        nchains : int
            Number of negative chains.
        sml_gibbs_steps : int
            Number of Gibbs steps to take per update.
        """
        Model.__init__(self)
        Block.__init__(self)

        if init_bias_vis_marginals is not None:
            assert init_bias_vis is None
            X = init_bias_vis_marginals.X
            assert X.min() >= 0.0
            assert X.max() <= 1.0

            marginals = X.mean(axis=0)

            # Rescale the marginals a bit to avoid NaNs
            init_bias_vis = inverse_sigmoid_numpy(.01 + .98 * marginals)

        if init_bias_vis is None:
            init_bias_vis = 0.0

        if rng is None:
            # TODO: global rng configuration stuff.
            rng = numpy.random.RandomState(1001)
        self.rng = rng

        if vis_space is None:
            # If we don't specify things in terms of spaces and a
            # transformer, assume dense matrix multiplication and work off
            # of nvis, nhid.
            assert hid_space is None
            assert transformer is None or isinstance(transformer, MatrixMul)
            assert nvis is not None
            assert nhid is not None

            if transformer is None:
                if random_patches_src is None:
                    W = rng.uniform(-irange, irange, (nvis, nhid))
                else:
                    if hasattr(random_patches_src, '__array__'):
                        W = irange * random_patches_src.T
                        assert W.shape == (nvis, nhid)
                    else:
                        W = irange * \
                            random_patches_src.get_batch_design(nhid).T

                self.transformer = MatrixMul(sharedX(W, name='W',
                                                     borrow=True))
            else:
                self.transformer = transformer

            self.vis_space = VectorSpace(nvis)
            self.hid_space = VectorSpace(nhid)
        else:
            assert hid_space is not None
            assert transformer is not None
            assert nvis is None
            assert nhid is None

            self.vis_space = vis_space
            self.hid_space = hid_space
            self.transformer = transformer

        try:
            b_vis = self.vis_space.get_origin()
            b_vis += init_bias_vis
        except ValueError:
            raise ValueError("bad shape or value for init_bias_vis")
        self.bias_vis = sharedX(b_vis, name='bias_vis', borrow=True)

        try:
            b_hid = self.hid_space.get_origin()
            b_hid += init_bias_hid
        except ValueError:
            raise ValueError('bad shape or value for init_bias_hid')
        self.bias_hid = sharedX(b_hid, name='bias_hid', borrow=True)

        self.random_patches_src = random_patches_src
        self.register_names_to_del(['random_patches_src'])

        self.__dict__.update(nhid=nhid, nvis=nvis)
        self._params = safe_union(self.transformer.get_params(),
                                  [self.bias_vis, self.bias_hid])

        self.base_lr = base_lr
        self.anneal_start = anneal_start
        self.nchains = nchains
        self.sml_gibbs_steps = sml_gibbs_steps

    def get_input_dim(self):
        if not isinstance(self.vis_space, VectorSpace):
            raise TypeError("Can't describe " + str(type(self.vis_space)) +
                            " as a dimensionality number.")
        return self.vis_space.dim

    def get_output_dim(self):
        if not isinstance(self.hid_space, VectorSpace):
            raise TypeError("Can't describe " + str(type(self.hid_space)) +
                            " as a dimensionality number.")
        return self.hid_space.dim

    def get_input_space(self):
        return self.vis_space

    def get_output_space(self):
        return self.hid_space

    def get_params(self):
        return [param for param in self._params]

    def get_weights(self, borrow=False):
        weights, = self.transformer.get_params()
        return weights.get_value(borrow=borrow)

    def get_weights_topo(self):
        return self.transformer.get_weights_topo()

    def get_weights_format(self):
        return ['v', 'h']

    def get_monitoring_channels(self, data):
        V = data
        theano_rng = RandomStreams(42)

        # TODO: re-enable this in the case where self.transformer
        # is a matrix multiply
        # norms = theano_norms(self.weights)

        H = self.mean_h_given_v(V)
        h = H.mean(axis=0)

        return {'bias_hid_min': T.min(self.bias_hid),
                'bias_hid_mean': T.mean(self.bias_hid),
                'bias_hid_max': T.max(self.bias_hid),
                'bias_vis_min': T.min(self.bias_vis),
                'bias_vis_mean': T.mean(self.bias_vis),
                'bias_vis_max': T.max(self.bias_vis),
                'h_min': T.min(h),
                'h_mean': T.mean(h),
                'h_max': T.max(h),
                # 'W_min': T.min(self.weights),
                # 'W_max': T.max(self.weights),
                # 'W_norms_min': T.min(norms),
                # 'W_norms_max': T.max(norms),
                # 'W_norms_mean': T.mean(norms),
                'reconstruction_error':
                    self.reconstruction_error(V, theano_rng)}

    def get_monitoring_data_specs(self):
        """
        Get the data_specs describing the data for
        get_monitoring_channels.

        This implementation returns a specification corresponding to
        unlabeled inputs.
        """
        return (self.get_input_space(), self.get_input_source())

    def ml_gradients(self, pos_v, neg_v):
        """
        Get the contrastive gradients given positive and negative phase
        visible units.

        Parameters
        ----------
        pos_v : tensor_like
            Theano symbolic representing a minibatch on the visible units,
            with the first dimension indexing training examples and the
            second indexing data dimensions (usually actual training
            data).
        neg_v : tensor_like
            Theano symbolic representing a minibatch on the visible units,
            with the first dimension indexing training examples and the
            second indexing data dimensions (usually reconstructions of
            the data or sampler particles from a persistent Markov chain).

        Returns
        -------
        grads : list
            List of Theano symbolic variables representing gradients with
            respect to model parameters, in the same order as returned by
            `params()`.

        Notes
        -----
        `pos_v` and `neg_v` need not have the same first dimension, i.e.
        minibatch size.
        """
        # Taking the mean over each term independently allows for
        # different minibatch sizes in the positive and negative phase.
        ml_cost = (self.free_energy_given_v(pos_v).mean() -
                   self.free_energy_given_v(neg_v).mean())

        grads = tensor.grad(ml_cost, self.get_params(),
                            consider_constant=[pos_v, neg_v])

        return grads

    def train_batch(self, dataset, batch_size):
        """ A default learning rule based on SML """
        self.learn_mini_batch(dataset.get_batch_design(batch_size))
        return True

    def learn_mini_batch(self, X):
        """ A default learning rule based on SML """
        if not hasattr(self, 'learn_func'):
            self.redo_theano()

        rval = self.learn_func(X)

        return rval

    def redo_theano(self):
        """ Compiles the theano function for the default learning rule """
        init_names = dir(self)

        minibatch = tensor.matrix()

        optimizer = _SGDOptimizer(self, self.base_lr, self.anneal_start)

        sampler = BlockGibbsSampler(
            self,
            0.5 + np.zeros((self.nchains, self.get_input_dim())),
            self.rng,
            steps=self.sml_gibbs_steps)

        updates = training_updates(visible_batch=minibatch, model=self,
                                   sampler=sampler, optimizer=optimizer)

        self.learn_func = theano.function([minibatch], updates=updates)

        final_names = dir(self)
        self.register_names_to_del([name for name in final_names
                                    if name not in init_names])

    def gibbs_step_for_v(self, v, rng):
        """
        Do a round of block Gibbs sampling given a visible configuration.

        Parameters
        ----------
        v : tensor_like
            Theano symbolic representing the visible unit states for a
            batch of training examples (or negative phase particles), with
            the first dimension indexing training examples and the second
            indexing data dimensions.
        rng : RandomStreams object
            Random number generator to use for sampling the hidden and
            visible units.

        Returns
        -------
        v_sample : tensor_like
            Theano symbolic representing the new visible unit state after
            one round of Gibbs sampling.
        locals : dict
            Contains the following auxiliary state as keys (all symbolics
            except shape tuples):

            * `h_mean`: the returned value from `mean_h_given_v`
            * `h_mean_shape`: shape tuple indicating the size of `h_mean`
              and `h_sample`
            * `h_sample`: the stochastically sampled hidden units
            * `v_mean_shape`: shape tuple indicating the shape of `v_mean`
              and `v_sample`
            * `v_mean`: the returned value from `mean_v_given_h`
            * `v_sample`: the stochastically sampled visible units
        """
        h_mean = self.mean_h_given_v(v)
        assert h_mean.type.dtype == v.type.dtype
        # For binary hidden units.
        # TODO: factor further to extend to other kinds of hidden units
        # (e.g. spike-and-slab).
        h_sample = rng.binomial(size=h_mean.shape, n=1, p=h_mean,
                                dtype=h_mean.type.dtype)
        assert h_sample.type.dtype == v.type.dtype
        # v_mean is always based on h_sample, not h_mean, because we don't
        # want h transmitting more than one bit of information per unit.
        v_mean = self.mean_v_given_h(h_sample)
        assert v_mean.type.dtype == v.type.dtype
        v_sample = self.sample_visibles([v_mean], v_mean.shape, rng)
        assert v_sample.type.dtype == v.type.dtype
        return v_sample, locals()

    def sample_visibles(self, params, shape, rng):
        """
        Stochastically sample the visible units given hidden unit
        configurations for a set of training examples.

        Parameters
        ----------
        params : list
            List of the necessary parameters to sample :math:`p(v|h)`. In
            the case of a binary-binary RBM this is a single-element list
            containing the symbolic representing :math:`p(v|h)`, as
            returned by `mean_v_given_h`.

        Returns
        -------
        vprime : tensor_like
            Theano symbolic representing stochastic samples from
            :math:`p(v|h)`.
        """
        v_mean = params[0]
        return as_floatX(rng.uniform(size=shape) < v_mean)

    def input_to_h_from_v(self, v):
        """
        Compute the affine function (linear map plus bias) that serves as
        input to the hidden layer in an RBM.

        Parameters
        ----------
        v : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the one or
            several minibatches on the visible units, with the first
            dimension indexing training examples and the second indexing
            data dimensions.

        Returns
        -------
        a : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the input to
            each hidden unit for each training example.
        """
        if isinstance(v, tensor.Variable):
            return self.bias_hid + self.transformer.lmul(v)
        else:
            return [self.input_to_h_from_v(vis) for vis in v]

    def input_to_v_from_h(self, h):
        """
        Compute the affine function (linear map plus bias) that serves as
        input to the visible layer in an RBM.

        Parameters
        ----------
        h : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the one or
            several minibatches on the hidden units, with the first
            dimension indexing training examples and the second indexing
            data dimensions.

        Returns
        -------
        a : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the input to
            each visible unit for each row of h.
        """
        if isinstance(h, tensor.Variable):
            return self.bias_vis + self.transformer.lmul_T(h)
        else:
            return [self.input_to_v_from_h(hid) for hid in h]

    def upward_pass(self, v):
        """
        Wrapper around the mean_h_given_v method. Called when the RBM is
        accessed by mlp.HiddenLayer.
        """
        return self.mean_h_given_v(v)

    def mean_h_given_v(self, v):
        """
        Compute the mean activation of the hidden units given visible unit
        configurations for a set of training examples.

        Parameters
        ----------
        v : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the visible
            unit states for a batch (or several) of training examples,
            with the first dimension indexing training examples and the
            second indexing data dimensions.

        Returns
        -------
        h : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the mean
            (deterministic) hidden unit activations given the visible
            units.
        """
        if isinstance(v, tensor.Variable):
            return nnet.sigmoid(self.input_to_h_from_v(v))
        else:
            return [self.mean_h_given_v(vis) for vis in v]

    def mean_v_given_h(self, h):
        """
        Compute the mean activation of the visible units given hidden unit
        configurations for a set of training examples.

        Parameters
        ----------
        h : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the hidden unit
            states for a batch (or several) of training examples, with the
            first dimension indexing training examples and the second
            indexing hidden units.

        Returns
        -------
        vprime : tensor_like or list of tensor_likes
            Theano symbolic (or list thereof) representing the mean
            (deterministic) reconstruction of the visible units given the
            hidden units.
        """
        if isinstance(h, tensor.Variable):
            return nnet.sigmoid(self.input_to_v_from_h(h))
        else:
            return [self.mean_v_given_h(hid) for hid in h]

    def free_energy_given_v(self, v):
        """
        Calculate the free energy of a visible unit configuration by
        marginalizing over the hidden units.

        Parameters
        ----------
        v : tensor_like
            Theano symbolic representing the visible unit states for a
            batch of training examples, with the first dimension indexing
            training examples and the second indexing data dimensions.

        Returns
        -------
        f : tensor_like
            1-dimensional tensor (vector) representing the free energy
            associated with each row of v.
        """
        # F(v) = -bias_vis^T v - sum_j softplus(bias_hid_j + (v W)_j)
        sigmoid_arg = self.input_to_h_from_v(v)
        return (-tensor.dot(v, self.bias_vis) -
                nnet.softplus(sigmoid_arg).sum(axis=1))

    def free_energy(self, V):
        return self.free_energy_given_v(V)

    def free_energy_given_h(self, h):
        """
        Calculate the free energy of a hidden unit configuration by
        marginalizing over the visible units.

        Parameters
        ----------
        h : tensor_like
            Theano symbolic representing the hidden unit states, with the
            first dimension indexing training examples and the second
            indexing data dimensions.

        Returns
        -------
        f : tensor_like
            1-dimensional tensor (vector) representing the free energy
            associated with each row of h.
        """
        sigmoid_arg = self.input_to_v_from_h(h)
        return (-tensor.dot(h, self.bias_hid) -
                nnet.softplus(sigmoid_arg).sum(axis=1))

    def __call__(self, v):
        """
        Forward propagate (symbolic) input through this module, obtaining
        a representation to pass on to layers above.

        This just aliases the `mean_h_given_v()` function for syntactic
        sugar/convenience.
        """
        return self.mean_h_given_v(v)

    def reconstruction_error(self, v, rng):
        """
        Compute the mean-squared error (mean over examples, sum over
        units) across a minibatch after a Gibbs step starting from the
        training data.

        Parameters
        ----------
        v : tensor_like
            Theano symbolic representing the visible unit states for a
            batch of training examples, with the first dimension indexing
            training examples and the second indexing data dimensions.
        rng : RandomStreams object
            Random number generator to use for sampling the hidden and
            visible units.

        Returns
        -------
        mse : tensor_like
            0-dimensional tensor (essentially a scalar) indicating the
            mean reconstruction error across the minibatch.

        Notes
        -----
        The reconstruction used to assess error samples only the hidden
        units. For the visible units, it uses the conditional mean. No
        sampling of the visible units is done, to reduce noise in the
        estimate.
        """
        sample, _locals = self.gibbs_step_for_v(v, rng)
        return ((_locals['v_mean'] - v) ** 2).sum(axis=1).mean()
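
# Usage sketch (illustrative, not part of the pylearn2 API): how one might
# wire up a small binary-binary RBM and compile its conditional mean and
# reconstruction-error expressions. It assumes only the imports this module
# already relies on (np, theano, tensor, RandomStreams); the function name
# `_rbm_usage_sketch` and the 784/500 sizes are made up for the example.
def _rbm_usage_sketch():
    rbm = RBM(nvis=784, nhid=500, irange=0.05)
    v = tensor.matrix('v')
    # Deterministic upward pass: p(h = 1 | v) = sigmoid(v W + bias_hid).
    infer = theano.function([v], rbm.mean_h_given_v(v))
    # Mean-squared reconstruction error after one Gibbs step; the same
    # expression backs the 'reconstruction_error' monitoring channel.
    theano_rng = RandomStreams(17)
    recon_err = theano.function([v],
                                rbm.reconstruction_error(v, theano_rng))
    return infer, recon_err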
class BinaryVectorMaxPool(HiddenLayer):
    """
    A hidden layer that does max-pooling on binary vectors. It has two
    sublayers, the detector layer and the pooling layer. The detector
    layer is its downward state and the pooling layer is its upward state.

    TODO: this layer uses (pooled, detector) as its total state, which can
    be confusing when listing all the states in the network left to right.
    Change this and pylearn2.expr.probabilistic_max_pooling to use
    (detector, pooled).
    """

    def __init__(self, detector_layer_dim, pool_size, layer_name,
                 irange=None, sparse_init=None, include_prob=1.0,
                 init_bias=0.):
        """
        include_prob: probability of including a weight element in the set
        of weights initialized to U(-irange, irange). If not included, it
        is initialized to 0.
        """
        self.__dict__.update(locals())
        del self.self
        self.b = sharedX(np.zeros((self.detector_layer_dim,)) + init_bias,
                         name=layer_name + '_b')

    def set_input_space(self, space):
        """ Note: this resets parameters! """
        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        if not (self.detector_layer_dim % self.pool_size == 0):
            raise ValueError("detector_layer_dim = %d, pool_size = %d. "
                             "Should be divisible but remainder is %d" %
                             (self.detector_layer_dim, self.pool_size,
                              self.detector_layer_dim % self.pool_size))

        self.h_space = VectorSpace(self.detector_layer_dim)
        self.pool_layer_dim = self.detector_layer_dim / self.pool_size
        self.output_space = VectorSpace(self.pool_layer_dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.detector_layer_dim)) * \
                (rng.uniform(0., 1.,
                             (self.input_dim, self.detector_layer_dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.detector_layer_dim))
            # Give each detector unit sparse_init nonzero incoming
            # weights, drawn from a standard normal.
            for i in xrange(self.detector_layer_dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def get_total_state_space(self):
        return CompositeSpace((self.output_space, self.h_space))

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None
        return self.transformer.get_params().union([self.b])

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float)
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case. We actually don't
            # know how to format the weights in design space: we got the
            # data in topo space and we don't have access to the dataset.
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases):
        self.b.set_value(biases)

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_view_shape(self):
        total = self.detector_layer_dim
        cols = self.pool_size
        if cols == 1:
            # Let the PatchViewer decide how to arrange the units
            # when they're not pooled.
            raise NotImplementedError()
        # When they are pooled, make each pooling unit have one row.
        rows = total / cols
        return rows, cols

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()

        W, = self.transformer.get_params()
        W = W.T
        W = W.reshape((self.detector_layer_dim,
                       self.input_space.shape[0],
                       self.input_space.shape[1],
                       self.input_space.nchannels))
        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))

        return function([], W)()

    def upward_state(self, total_state):
        p, h = total_state
        self.h_space.validate(h)
        self.output_space.validate(p)
        return p

    def downward_state(self, total_state):
        p, h = total_state
        return h

    def get_monitoring_channels_from_state(self, state):
        P, H = state

        rval = {}

        if self.pool_size == 1:
            vars_and_prefixes = [(P, '')]
        else:
            vars_and_prefixes = [(P, 'p_'), (H, 'h_')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            for key, val in [('max_max', v_max.max()),
                             ('max_mean', v_max.mean()),
                             ('max_min', v_max.min()),
                             ('min_max', v_min.max()),
                             ('min_mean', v_min.mean()),
                             ('min_min', v_min.min()),
                             ('range_max', v_range.max()),
                             ('range_mean', v_range.mean()),
                             ('range_min', v_range.min()),
                             ('mean_max', v_mean.max()),
                             ('mean_mean', v_mean.mean()),
                             ('mean_min', v_mean.min())]:
                rval[prefix + key] = val

        return rval

    def get_l1_act_cost(self, state, target, coeff, eps=None):
        rval = 0.

        P, H = state
        self.output_space.validate(P)
        self.h_space.validate(H)

        if self.pool_size == 1:
            # If the pool size is 1 then pools = detectors,
            # and we should not penalize pools and detectors separately.
            assert len(state) == 2
            assert isinstance(target, float)
            assert isinstance(coeff, float)
            _, state = state
            state = [state]
            target = [target]
            coeff = [coeff]
            if eps is None:
                eps = [0.]
            else:
                eps = [eps]
        else:
            assert all([len(elem) == 2 for elem in [state, target, coeff]])
            if eps is None:
                eps = [0., 0.]
            if target[1] < target[0]:
                warnings.warn("Do you really want to regularize the "
                              "detector units to be sparser than the "
                              "pooling units?")

        for s, t, c, e in safe_zip(state, target, coeff, eps):
            assert all([isinstance(elem, float) for elem in [t, c, e]])
            if c == 0.:
                continue
            m = s.mean(axis=0)
            assert m.ndim == 1
            rval += T.maximum(abs(m - t) - e, 0.).mean() * c

        return rval

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after layer_above "
                             "/ state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size,
                                                     msg, theano_rng)

        return p_sample, h_sample

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state (not a mean
        field state) for this variable.
        """
        t1 = time.time()

        empty_input = self.h_space.get_origin_batch(num_examples)
        h_state = sharedX(empty_input)

        default_z = T.zeros_like(h_state) + self.b

        theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16))

        p_exp, h_exp, p_sample, h_sample = max_pool_channels(
            z=default_z,
            pool_size=self.pool_size,
            theano_rng=theano_rng)

        assert h_sample.dtype == default_z.dtype

        p_state = sharedX(self.output_space.get_origin_batch(num_examples))

        t2 = time.time()

        f = function([], updates={p_state: p_sample, h_state: h_sample})

        t3 = time.time()

        f()

        t4 = time.time()

        print str(self) + '.make_state took', t4 - t1
        print '\tcompose time:', t2 - t1
        print '\tcompile time:', t3 - t2
        print '\texecute time:', t4 - t3

        p_state.name = 'p_sample_shared'
        h_state.name = 'h_sample_shared'

        return p_state, h_state

    def expected_energy_term(self, state, average, state_below,
                             average_below):
        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but "
                                         "got shape of %d" %
                                         (self.dbm.batch_size,
                                          sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        downward_state = self.downward_state(state)
        self.h_space.validate(downward_state)

        # The energy function is linear, so it doesn't matter if we're
        # averaging or not. Specifically, our terms are -u^T W d - b^T d,
        # where u is the upward state of the layer below and d is the
        # downward state of this layer.
        bias_term = T.dot(downward_state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        downward_state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def mf_update(self, state_below, state_above, layer_above=None,
                  double_weights=False, iter_name=None):
        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but "
                                         "got shape of %d" %
                                         (self.dbm.batch_size,
                                          sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = ('msg_from_' + layer_above.layer_name + '_to_' +
                        self.layer_name + '[' + iter_name + ']')
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = (self.layer_name + '_' + iter_name +
                                '_2state')

        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'

        p, h = max_pool_channels(z, self.pool_size, msg)

        p.name = self.layer_name + '_p_' + iter_name
        h.name = self.layer_name + '_h_' + iter_name

        return p, h
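
# Numerical sketch (illustrative assumption, not a call into
# pylearn2.expr.probabilistic_max_pooling): the distribution that the
# standard probabilistic max-pooling formulation assigns to one pooling
# group, which is the role max_pool_channels plays above. Within a group
# of pool_size detector units with total inputs z_i, at most one detector
# is on, and the pooling unit p is on iff some detector is on:
#     P(h_i = 1) = exp(z_i) / (1 + sum_j exp(z_j)),  p = 1 - P(all off).
# Re-derived here in plain numpy under those assumptions.
def _pool_group_probs(z_group):
    m = z_group.max()
    e = np.exp(z_group - m)   # detector weights, shifted for stability
    off = np.exp(-m)          # weight of the all-off configuration
    denom = off + e.sum()
    h = e / denom             # mean of each detector unit
    p = 1. - off / denom      # mean of the pooling unit
    return p, h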
class IsingHidden(HiddenLayer):
    """
    A hidden layer with h being a vector in {-1, 1}^dim, implementing the
    energy function term

        -v^T W h - b^T h

    where W and b are parameters of this layer, and v is the upward state
    of the layer below.
    """

    def __init__(self, dim, layer_name, irange=None, sparse_init=None,
                 sparse_stdev=1., include_prob=1.0, init_bias=0.,
                 W_lr_scale=None, b_lr_scale=None, max_col_norm=None):
        """
        include_prob: probability of including a weight element in the set
        of weights initialized to U(-irange, irange). If not included, it
        is initialized to 0.
        """
        self.__dict__.update(locals())
        del self.self

        self.b = sharedX(np.zeros((self.dim,)) + init_bias,
                         name=layer_name + '_b')

    def get_lr_scalers(self):
        if not hasattr(self, 'W_lr_scale'):
            self.W_lr_scale = None

        if not hasattr(self, 'b_lr_scale'):
            self.b_lr_scale = None

        rval = OrderedDict()

        if self.W_lr_scale is not None:
            W, = self.transformer.get_params()
            rval[W] = self.W_lr_scale

        if self.b_lr_scale is not None:
            rval[self.b] = self.b_lr_scale

        return rval

    def set_input_space(self, space):
        """ Note: this resets parameters! """
        self.input_space = space

        if isinstance(space, VectorSpace):
            self.requires_reformat = False
            self.input_dim = space.dim
        else:
            self.requires_reformat = True
            self.input_dim = space.get_total_dimension()
            self.desired_space = VectorSpace(self.input_dim)

        self.output_space = VectorSpace(self.dim)

        rng = self.dbm.rng
        if self.irange is not None:
            assert self.sparse_init is None
            W = rng.uniform(-self.irange, self.irange,
                            (self.input_dim, self.dim)) * \
                (rng.uniform(0., 1., (self.input_dim, self.dim))
                 < self.include_prob)
        else:
            assert self.sparse_init is not None
            W = np.zeros((self.input_dim, self.dim))
            # Give each hidden unit sparse_init nonzero incoming weights,
            # following the same pattern as
            # BinaryVectorMaxPool.set_input_space above.
            for i in xrange(self.dim):
                for j in xrange(self.sparse_init):
                    idx = rng.randint(0, self.input_dim)
                    while W[idx, i] != 0:
                        idx = rng.randint(0, self.input_dim)
                    W[idx, i] = rng.randn()
            W *= self.sparse_stdev

        W = sharedX(W)
        W.name = self.layer_name + '_W'

        self.transformer = MatrixMul(W)

        W, = self.transformer.get_params()
        assert W.name is not None

    def censor_updates(self, updates):
        if self.max_col_norm is not None:
            W, = self.transformer.get_params()
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms /
                                          (1e-7 + col_norms))

    def get_total_state_space(self):
        return VectorSpace(self.dim)

    def get_params(self):
        assert self.b.name is not None
        W, = self.transformer.get_params()
        assert W.name is not None

        rval = self.transformer.get_params()
        assert not isinstance(rval, set)
        rval = list(rval)
        assert self.b not in rval
        rval.append(self.b)

        return rval

    def get_weight_decay(self, coeff):
        if isinstance(coeff, str):
            coeff = float(coeff)
        assert isinstance(coeff, float) or hasattr(coeff, 'dtype')
        W, = self.transformer.get_params()
        return coeff * T.sqr(W).sum()

    def get_weights(self):
        if self.requires_reformat:
            # This is not really an unimplemented case. We actually don't
            # know how to format the weights in design space: we got the
            # data in topo space and we don't have access to the dataset.
            raise NotImplementedError()
        W, = self.transformer.get_params()
        return W.get_value()

    def set_weights(self, weights):
        W, = self.transformer.get_params()
        W.set_value(weights)

    def set_biases(self, biases, recenter=False):
        self.b.set_value(biases)
        if recenter:
            assert self.center
            if self.pool_size != 1:
                raise NotImplementedError()
            self.offset.set_value(sigmoid_numpy(self.b.get_value()))

    def get_biases(self):
        return self.b.get_value()

    def get_weights_format(self):
        return ('v', 'h')

    def get_weights_topo(self):
        if not isinstance(self.input_space, Conv2DSpace):
            raise NotImplementedError()

        W, = self.transformer.get_params()
        W = W.T
        W = W.reshape((self.dim,
                       self.input_space.shape[0],
                       self.input_space.shape[1],
                       self.input_space.nchannels))
        W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c'))

        return function([], W)()

    def upward_state(self, total_state):
        return total_state

    def downward_state(self, total_state):
        return total_state

    def get_monitoring_channels(self):
        W, = self.transformer.get_params()

        assert W.ndim == 2

        sq_W = T.sqr(W)

        row_norms = T.sqrt(sq_W.sum(axis=1))
        col_norms = T.sqrt(sq_W.sum(axis=0))

        return OrderedDict([('row_norms_min', row_norms.min()),
                            ('row_norms_mean', row_norms.mean()),
                            ('row_norms_max', row_norms.max()),
                            ('col_norms_min', col_norms.min()),
                            ('col_norms_mean', col_norms.mean()),
                            ('col_norms_max', col_norms.max())])

    def get_monitoring_channels_from_state(self, state):
        P = state

        rval = OrderedDict()

        vars_and_prefixes = [(P, '')]

        for var, prefix in vars_and_prefixes:
            v_max = var.max(axis=0)
            v_min = var.min(axis=0)
            v_mean = var.mean(axis=0)
            v_range = v_max - v_min

            # max_x.mean_u is "the mean over *u*nits of the max over
            # e*x*amples". The x and u are included in the name because
            # otherwise it's hard to remember which axis is which when
            # reading the monitor. I use inner.outer rather than
            # outer_of_inner or something like that because I want
            # mean_x.* to appear next to each other in the alphabetical
            # list, as these are commonly plotted together.
            for key, val in [('max_x.max_u', v_max.max()),
                             ('max_x.mean_u', v_max.mean()),
                             ('max_x.min_u', v_max.min()),
                             ('min_x.max_u', v_min.max()),
                             ('min_x.mean_u', v_min.mean()),
                             ('min_x.min_u', v_min.min()),
                             ('range_x.max_u', v_range.max()),
                             ('range_x.mean_u', v_range.mean()),
                             ('range_x.min_u', v_range.min()),
                             ('mean_x.max_u', v_mean.max()),
                             ('mean_x.mean_u', v_mean.mean()),
                             ('mean_x.min_u', v_mean.min())]:
                rval[prefix + key] = val

        return rval

    def sample(self, state_below=None, state_above=None,
               layer_above=None, theano_rng=None):
        if theano_rng is None:
            raise ValueError("theano_rng is required; it just defaults to "
                             "None so that it may appear after layer_above "
                             "/ state_above in the list.")

        if state_above is not None:
            msg = layer_above.downward_message(state_above)
        else:
            msg = None

        if self.requires_reformat:
            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        z = self.transformer.lmul(state_below) + self.b
        if msg is not None:
            z = z + msg
        on_prob = T.nnet.sigmoid(2. * z)

        samples = theano_rng.binomial(p=on_prob, n=1, size=on_prob.shape,
                                      dtype=on_prob.dtype) * 2. - 1.
        return samples

    def downward_message(self, downward_state):
        rval = self.transformer.lmul_T(downward_state)

        if self.requires_reformat:
            rval = self.desired_space.format_as(rval, self.input_space)

        return rval

    def init_mf_state(self):
        raise NotImplementedError("This is just a copy-paste of BVMP")
        # Work around theano bug with broadcasted vectors.
        z = T.alloc(0., self.dbm.batch_size,
                    self.detector_layer_dim).astype(self.b.dtype) + \
            self.b.dimshuffle('x', 0)
        rval = max_pool_channels(z=z, pool_size=self.pool_size)
        return rval

    def make_state(self, num_examples, numpy_rng):
        """
        Returns a shared variable containing an actual state (not a mean
        field state) for this variable.
        """
        driver = numpy_rng.uniform(0., 1., (num_examples, self.dim))
        on_prob = sigmoid_numpy(2. * self.b.get_value())
        sample = 2. * (driver < on_prob) - 1.

        rval = sharedX(sample, name='v_sample_shared')

        return rval

    def make_symbolic_state(self, num_examples, theano_rng):
        mean = T.nnet.sigmoid(2. * self.b)
        rval = theano_rng.binomial(size=(num_examples, self.dim), p=mean)
        rval = 2. * rval - 1.

        return rval

    def expected_energy_term(self, state, average, state_below,
                             average_below):
        # state = Print('h_state', attrs=['min', 'max'])(state)

        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but "
                                         "got shape of %d" %
                                         (self.dbm.batch_size,
                                          sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        # The energy function is linear, so it doesn't matter if we're
        # averaging or not. Specifically, our terms are -u^T W d - b^T d,
        # where u is the upward state of the layer below and d is the
        # downward state of this layer.
        bias_term = T.dot(state, self.b)
        weights_term = (self.transformer.lmul(state_below) *
                        state).sum(axis=1)

        rval = -bias_term - weights_term

        assert rval.ndim == 1

        return rval

    def linear_feed_forward_approximation(self, state_below):
        """
        Used to implement TorontoSparsity. Unclear exactly what properties
        of it are important or how to implement it for other layers.

        Properties it must have:
            output is the same kind of data structure (i.e., tuple of
            theano 2-tensors) as mf_update.

        Properties it probably should have for other layer types:
            An infinitesimal change in state_below or the parameters
            should cause the same sign of change in the output of
            linear_feed_forward_approximation and in mf_update.

            Should not have any non-linearities that cause the gradient
            to shrink.

            Should disregard top-down feedback.
        """
        z = self.transformer.lmul(state_below) + self.b

        if self.pool_size != 1:
            # Should probably implement sum pooling for the non-pooled
            # version, but in reality it's not totally clear what the
            # right answer is.
            raise NotImplementedError()

        return z, z

    def mf_update(self, state_below, state_above, layer_above=None,
                  double_weights=False, iter_name=None):
        self.input_space.validate(state_below)

        if self.requires_reformat:
            if not isinstance(state_below, tuple):
                for sb in get_debug_values(state_below):
                    if sb.shape[0] != self.dbm.batch_size:
                        raise ValueError("self.dbm.batch_size is %d but "
                                         "got shape of %d" %
                                         (self.dbm.batch_size,
                                          sb.shape[0]))
                    assert reduce(lambda x, y: x * y,
                                  sb.shape[1:]) == self.input_dim

            state_below = self.input_space.format_as(state_below,
                                                     self.desired_space)

        if iter_name is None:
            iter_name = 'anon'

        if state_above is not None:
            assert layer_above is not None
            msg = layer_above.downward_message(state_above)
            msg.name = ('msg_from_' + layer_above.layer_name + '_to_' +
                        self.layer_name + '[' + iter_name + ']')
        else:
            msg = None

        if double_weights:
            state_below = 2. * state_below
            state_below.name = (self.layer_name + '_' + iter_name +
                                '_2state')

        z = self.transformer.lmul(state_below) + self.b
        if self.layer_name is not None and iter_name is not None:
            z.name = self.layer_name + '_' + iter_name + '_z'

        if msg is not None:
            z = z + msg

        h = T.tanh(z)

        return h
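
# Sketch (illustrative): the {-1, +1} unit conditionals that
# IsingHidden.sample and IsingHidden.mf_update rely on. For a unit with
# total input z, P(h = +1) is proportional to exp(z) and P(h = -1) to
# exp(-z), so P(h = +1 | z) = sigmoid(2 z) and the conditional mean is
# E[h | z] = 2 * sigmoid(2 z) - 1 = tanh(z). A plain numpy check of that
# identity under these definitions; `_ising_unit_check` is a made-up name.
def _ising_unit_check(z):
    on_prob = 1. / (1. + np.exp(-2. * z))  # P(h = +1 | z) = sigmoid(2 z)
    mean = 2. * on_prob - 1.               # expectation of h in {-1, +1}
    assert np.allclose(mean, np.tanh(z))
    return on_prob, mean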