def test_vector_to_conv_c01b_invertible():
    """
    Tests that the format_as methods between Conv2DSpace
    and VectorSpace are invertible for the ('c', 0, 1, 'b')
    axis format.
    """
    rng = np.random.RandomState([2013, 5, 1])
    batch_size = 3
    rows = 4
    cols = 5
    channels = 2

    conv = Conv2DSpace([rows, cols], channels=channels, axes=('c', 0, 1, 'b'))
    vec = VectorSpace(conv.get_total_dimension())

    X = conv.make_batch_theano()
    Y = conv.format_as(X, vec)
    Z = vec.format_as(Y, conv)

    A = vec.make_batch_theano()
    B = vec.format_as(A, conv)
    C = conv.format_as(B, vec)

    f = function([X, A], [Z, C])

    X = rng.randn(*(conv.get_origin_batch(batch_size).shape)).astype(X.dtype)
    A = rng.randn(*(vec.get_origin_batch(batch_size).shape)).astype(A.dtype)

    Z, C = f(X, A)

    np.testing.assert_allclose(Z, X)
    np.testing.assert_allclose(C, A)
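The invertibility exercised by this test reduces to an axis transpose plus a reshape and their inverses. A minimal numpy sketch of that round trip, assuming (for illustration only, not as a statement about pylearn2 internals) that the vector layout follows ('b', 0, 1, 'c') order:

import numpy as np

# Hedged sketch: emulate the ('c', 0, 1, 'b') <-> VectorSpace round trip with
# plain numpy transposes and reshapes.
rng = np.random.RandomState(0)
channels, rows, cols, batch = 2, 4, 5, 3
topo = rng.randn(channels, rows, cols, batch)              # ('c', 0, 1, 'b')

as_b01c = topo.transpose(3, 1, 2, 0)                       # -> ('b', 0, 1, 'c')
design = as_b01c.reshape(batch, rows * cols * channels)    # flat design matrix

back = design.reshape(batch, rows, cols, channels).transpose(3, 1, 2, 0)
assert np.allclose(back, topo)                             # round trip is exact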
def simulate(inputs, model):
    space = VectorSpace(inputs.shape[1])
    X = space.get_theano_batch()
    Y = model.fprop(space.format_as(X, model.get_input_space()))
    f = theano.function([X], Y)
    result = []
    for x in xrange(0, len(inputs), 100):
        result.extend(f(inputs[x:x + 100]))
    return result
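A small numpy sketch of the chunking idiom used above; predict is a hypothetical stand-in for the compiled Theano function, and the point is that slicing past the end of an array is safe, so the final (possibly short) chunk needs no special case:

import numpy as np

def predict(batch):
    # stand-in for the compiled theano function f
    return batch.sum(axis=1)

inputs = np.arange(250.0).reshape(250, 1)
result = []
for start in range(0, len(inputs), 100):
    result.extend(predict(inputs[start:start + 100]))
assert len(result) == len(inputs)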
def inv_prop(self, state_above):
    if not isinstance(state_above, tuple):
        expected_space = VectorSpace(
            self.output_space.get_total_dimension())
        state_above = expected_space.format_as(state_above, self.output_space)
    self.output_space.validate(state_above)
    return tuple(
        layer.inv_prop(state)
        for layer, state in safe_zip(self.layers, state_above))
class VectorSpaceConverter(mlp.Layer):
    def __init__(self, layer_name):
        self.layer_name = layer_name
        self._params = []

    def set_input_space(self, space):
        self.input_space = space
        self.output_space = VectorSpace(space.get_total_dimension())

    def fprop(self, state_below):
        return self.input_space.format_as(state_below, self.output_space)

    def inv_prop(self, state_above):
        return self.output_space.format_as(state_above, self.input_space)

    def get_weight_decay(self, coeff):
        return 0.0

    def get_l1_weight_decay(self, coeff):
        return 0.0
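A hedged plain-numpy analogue of this converter, illustrating that the layer is parameter-free (hence zero weight decay) and only reshapes between a topological batch and a flat design matrix; the names below are hypothetical, not the pylearn2 API:

import numpy as np

class FlattenLayer(object):
    def set_input_shape(self, shape):
        self.input_shape = shape                  # e.g. (rows, cols, channels)
        self.output_dim = int(np.prod(shape))

    def fprop(self, state_below):                 # (batch,) + input_shape
        return state_below.reshape(state_below.shape[0], self.output_dim)

    def inv_prop(self, state_above):              # (batch, output_dim)
        return state_above.reshape((state_above.shape[0],) + self.input_shape)

layer = FlattenLayer()
layer.set_input_shape((4, 5, 2))
x = np.random.randn(3, 4, 5, 2)
assert np.allclose(layer.inv_prop(layer.fprop(x)), x)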
class Softmax(HiddenLayer): def __init__(self, n_classes, layer_name, irange=None, sparse_init=None, W_lr_scale=None): if isinstance(W_lr_scale, str): W_lr_scale = float(W_lr_scale) self.__dict__.update(locals()) del self.self assert isinstance(n_classes, int) self.output_space = VectorSpace(n_classes) self.b = sharedX(np.zeros((n_classes, )), name='softmax_b') def get_lr_scalers(self): rval = {} # Patch old pickle files if not hasattr(self, 'W_lr_scale'): self.W_lr_scale = None if self.W_lr_scale is not None: assert isinstance(self.W_lr_scale, float) rval[self.W] = self.W_lr_scale return rval def get_total_state_space(self): return self.output_space def get_monitoring_channels_from_state(self, state): mx = state.max(axis=1) return { 'mean_max_class': mx.mean(), 'max_max_class': mx.max(), 'min_max_class': mx.min() } def set_input_space(self, space): self.input_space = space if not isinstance(space, Space): raise TypeError("Expected Space, got " + str(space) + " of type " + str(type(space))) self.input_dim = space.get_total_dimension() self.needs_reformat = not isinstance(space, VectorSpace) self.desired_space = VectorSpace(self.input_dim) if not self.needs_reformat: assert self.desired_space == self.input_space rng = self.dbm.rng if self.irange is not None: assert self.sparse_init is None W = rng.uniform(-self.irange, self.irange, (self.input_dim, self.n_classes)) else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.n_classes)) for i in xrange(self.n_classes): for j in xrange(self.sparse_init): idx = rng.randint(0, self.input_dim) while W[idx, i] != 0.: idx = rng.randint(0, self.input_dim) W[idx, i] = rng.randn() self.W = sharedX(W, 'softmax_W') self._params = [self.b, self.W] def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() desired = self.W.get_value().T ipt = self.desired_space.format_as(desired, self.input_space) rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) return rval def get_weights(self): if not isinstance(self.input_space, VectorSpace): raise NotImplementedError() return self.W.get_value() def set_weights(self, weights): self.W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def sample(self, state_below=None, state_above=None, layer_above=None, theano_rng=None): if state_above is not None: # If you implement this case, also add a unit test for it. # Or at least add a warning that it is not tested. raise NotImplementedError() if theano_rng is None: raise ValueError( "theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list." 
) self.input_space.validate(state_below) # patch old pickle files if not hasattr(self, 'needs_reformat'): self.needs_reformat = self.needs_reshape del self.needs_reshape if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) self.desired_space.validate(state_below) z = T.dot(state_below, self.W) + self.b h_exp = T.nnet.softmax(z) h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype) return h_sample def mf_update(self, state_below, state_above=None, layer_above=None, double_weights=False, iter_name=None): if state_above is not None: raise NotImplementedError() if double_weights: raise NotImplementedError() self.input_space.validate(state_below) # patch old pickle files if not hasattr(self, 'needs_reformat'): self.needs_reformat = self.needs_reshape del self.needs_reshape if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) self.desired_space.validate(state_below) """ from pylearn2.utils import serial X = serial.load('/u/goodfeli/galatea/dbm/inpaint/expdir/cifar10_N3_interm_2_features.pkl') state_below = Verify(X,'features')(state_below) """ assert self.W.ndim == 2 assert state_below.ndim == 2 b = self.b Z = T.dot(state_below, self.W) + b #Z = Print('Z')(Z) rval = T.nnet.softmax(Z) return rval def downward_message(self, downward_state): rval = T.dot(downward_state, self.W.T) rval = self.desired_space.format_as(rval, self.input_space) return rval def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale): """ scale is because the visible layer also goes into the cost. it uses the mean over units and examples, so that the scale of the cost doesn't change too much with batch size or example size. we need to multiply this cost by scale to make sure that it is put on the same scale as the reconstruction cost for the visible units. ie, scale should be 1/nvis """ Y_hat = Y_hat_unmasked assert hasattr(Y_hat, 'owner') owner = Y_hat.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 Y_hat, = owner.inputs owner = Y_hat.owner op = owner.op assert isinstance(op, T.nnet.Softmax) z, = owner.inputs assert z.ndim == 2 z = z - z.max(axis=1).dimshuffle(0, 'x') log_prob = z - T.exp(z).sum(axis=1).dimshuffle(0, 'x') # we use sum and not mean because this is really one variable per row log_prob_of = (Y * log_prob).sum(axis=1) masked = log_prob_of * drop_mask_Y assert masked.ndim == 1 rval = masked.mean() * scale return -rval def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. 
""" t1 = time.time() empty_input = self.output_space.get_origin_batch(num_examples) h_state = sharedX(empty_input) default_z = T.zeros_like(h_state) + self.b theano_rng = MRG_RandomStreams(numpy_rng.randint(2**16)) h_exp = T.nnet.softmax(default_z) h_sample = theano_rng.multinomial(pvals=h_exp, dtype=h_exp.dtype) p_state = sharedX(self.output_space.get_origin_batch(num_examples)) t2 = time.time() f = function([], updates={h_state: h_sample}) t3 = time.time() f() t4 = time.time() print str(self) + '.make_state took', t4 - t1 print '\tcompose time:', t2 - t1 print '\tcompile time:', t3 - t2 print '\texecute time:', t4 - t3 h_state.name = 'softmax_sample_shared' return h_state def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) return coeff * T.sqr(self.W).sum() def expected_energy_term(self, state, average, state_below, average_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) self.desired_space.validate(state_below) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(state, self.b) weights_term = (T.dot(state_below, self.W) * state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval
class Softmax(Layer): def __init__(self, n_classes, layer_name, irange = None, istdev = None, sparse_init = None, W_lr_scale = None, b_lr_scale = None, max_row_norm = None): """ """ if isinstance(W_lr_scale, str): W_lr_scale = float(W_lr_scale) self.__dict__.update(locals()) del self.self assert isinstance(n_classes, int) self.output_space = VectorSpace(n_classes) self.b = sharedX( np.zeros((n_classes,)), name = 'softmax_b') def get_lr_scalers(self): rval = OrderedDict() if self.W_lr_scale is not None: assert isinstance(self.W_lr_scale, float) rval[self.W] = self.W_lr_scale if not hasattr(self, 'b_lr_scale'): self.b_lr_scale = None if self.b_lr_scale is not None: assert isinstance(self.b_lr_scale, float) rval[self.b] = self.b_lr_scale return rval def get_monitoring_channels_from_state(self, state, target=None): mx = state.max(axis=1) rval = OrderedDict([ ('mean_max_class' , mx.mean()), ('max_max_class' , mx.max()), ('min_max_class' , mx.min()) ]) if target is not None: y_hat = T.argmax(state, axis=1) y = T.argmax(target, axis=1) misclass = T.neq(y, y_hat).mean() misclass = T.cast(misclass, config.floatX) rval['misclass'] = misclass return rval def set_input_space(self, space): self.input_space = space if not isinstance(space, Space): raise TypeError("Expected Space, got "+ str(space)+" of type "+str(type(space))) self.input_dim = space.get_total_dimension() self.needs_reformat = not isinstance(space, VectorSpace) self.desired_space = VectorSpace(self.input_dim) if not self.needs_reformat: assert self.desired_space == self.input_space rng = self.mlp.rng if self.irange is not None: assert self.istdev is None assert self.sparse_init is None W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes)) elif self.istdev is not None: assert self.sparse_init is None W = rng.randn(self.input_dim, self.n_classes) * self.istdev else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.n_classes)) for i in xrange(self.n_classes): for j in xrange(self.sparse_init): idx = rng.randint(0, self.input_dim) while W[idx, i] != 0.: idx = rng.randint(0, self.input_dim) W[idx, i] = rng.randn() self.W = sharedX(W, 'softmax_W' ) self._params = [ self.b, self.W ] def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() desired = self.W.get_value().T ipt = self.desired_space.format_as(desired, self.input_space) rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) return rval def get_weights(self): if not isinstance(self.input_space, VectorSpace): raise NotImplementedError() return self.W.get_value() def set_weights(self, weights): self.W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert self.W.ndim == 2 assert state_below.ndim == 2 b = self.b Z = T.dot(state_below, self.W) + b rval = T.nnet.softmax(Z) for value in get_debug_values(rval): assert value.shape[0] == self.mlp.batch_size return rval def cost(self, Y, Y_hat): """ Y must be one-hot binary. Y_hat is a softmax estimate. of Y. 
Returns negative log probability of Y under the Y_hat distribution. """ assert hasattr(Y_hat, 'owner') owner = Y_hat.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 Y_hat, = owner.inputs owner = Y_hat.owner op = owner.op assert isinstance(op, T.nnet.Softmax) z ,= owner.inputs assert z.ndim == 2 z = z - z.max(axis=1).dimshuffle(0, 'x') log_prob = z - T.log(T.exp(z).sum(axis=1).dimshuffle(0, 'x')) # we use sum and not mean because this is really one variable per row log_prob_of = (Y * log_prob).sum(axis=1) assert log_prob_of.ndim == 1 rval = log_prob_of.mean() return - rval def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') return coeff * T.sqr(self.W).sum() def censor_updates(self, updates): if self.max_row_norm is not None: W = self.W if W in updates: updated_W = updates[W] row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1)) desired_norms = T.clip(row_norms, 0, self.max_row_norm) updates[W] = updated_W * (desired_norms / (1e-7 + row_norms)).dimshuffle(0, 'x')
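The max_row_norm constraint in censor_updates rescales any row of W whose L2 norm exceeds the limit back onto the constraint surface. A hedged numpy version of the same operation:

import numpy as np

def clip_row_norms(W, max_row_norm, eps=1e-7):
    # rows above the limit are scaled onto it; rows inside are (up to the
    # epsilon) left unchanged
    row_norms = np.sqrt((W ** 2).sum(axis=1))
    desired = np.clip(row_norms, 0, max_row_norm)
    return W * (desired / (eps + row_norms))[:, np.newaxis]

W = np.random.RandomState(0).randn(10, 4) * 3.0
W_clipped = clip_row_norms(W, max_row_norm=1.9)
assert np.sqrt((W_clipped ** 2).sum(axis=1)).max() <= 1.9 + 1e-6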
class ConditionalGenerator(Generator):
    def __init__(self, mlp, input_condition_space, condition_distribution,
                 noise_dim=100, *args, **kwargs):
        super(ConditionalGenerator, self).__init__(mlp, *args, **kwargs)

        self.noise_dim = noise_dim
        self.noise_space = VectorSpace(dim=self.noise_dim)

        self.condition_space = input_condition_space
        self.condition_distribution = condition_distribution

        self.input_space = CompositeSpace(
            [self.noise_space, self.condition_space])
        self.mlp.set_input_space(self.input_space)

    def sample_and_noise(self, conditional_data,
                         default_input_include_prob=1.,
                         default_input_scale=1., all_g_layers=False):
        """
        Retrieve a sample (and the noise used to generate the sample)
        conditioned on some input data.

        Parameters
        ----------
        conditional_data: member of self.condition_space
            A minibatch of conditional data to feedforward.
        default_input_include_prob: float
            WRITEME
        default_input_scale: float
            WRITEME
        all_g_layers: boolean
            If true, return all generator layers in `other_layers` slot of
            this method's return value. (Otherwise returns `None` in this
            slot.)

        Returns
        -------
        net_output: 3-tuple
            Tuple of the form `(sample, noise, other_layers)`.
        """
        if isinstance(conditional_data, int):
            conditional_data = self.condition_distribution.sample(
                conditional_data)

        num_samples = conditional_data.shape[0]

        noise = self.get_noise((num_samples, self.noise_dim))
        # TODO necessary?
        formatted_noise = self.noise_space.format_as(noise, self.noise_space)

        # Build inputs: concatenate noise with conditional data
        inputs = (formatted_noise, conditional_data)

        # Feedforward
        # if all_g_layers:
        #     rval = self.mlp.dropout_fprop(inputs, default_input_include_prob=default_input_include_prob,
        #                                   default_input_scale=default_input_scale, return_all=all_g_layers)
        #     other_layers, rval = rval[:-1], rval[-1]
        # else:
        rval = self.mlp.dropout_fprop(
            inputs, default_input_include_prob=default_input_include_prob,
            default_input_scale=default_input_scale)
        # other_layers = None

        return rval, formatted_noise, conditional_data, None  # , other_layers

    def sample(self, conditional_data, **kwargs):
        sample, _, _, _ = self.sample_and_noise(conditional_data, **kwargs)
        return sample

    def get_monitoring_channels(self, data):
        if data is None:
            m = 100
            conditional_data = self.condition_distribution.sample(m)
        else:
            _, conditional_data = data
            m = conditional_data.shape[0]

        noise = self.get_noise((m, self.noise_dim))
        rval = OrderedDict()

        sampled_data = (noise, conditional_data)
        try:
            rval.update(self.mlp.get_monitoring_channels((sampled_data, None)))
        except Exception:
            warnings.warn(
                "something went wrong with generator.mlp's monitoring channels")

        if self.monitor_ll:
            rval['ll'] = T.cast(
                self.ll(data, self.ll_n_samples, self.ll_sigma),
                theano.config.floatX).mean()
            rval['nll'] = -rval['ll']
        return rval

    def ll(self, data, n_samples, sigma):
        real_data, conditional_data = data
        sampled_data = self.sample(conditional_data)

        output_space = self.mlp.get_output_space()
        if 'Conv2D' in str(output_space):
            samples = output_space.convert(sampled_data, output_space.axes,
                                           ('b', 0, 1, 'c'))
            samples = samples.flatten(2)
            data = output_space.convert(real_data, output_space.axes,
                                        ('b', 0, 1, 'c'))
            data = data.flatten(2)

        parzen = theano_parzen(data, samples, sigma)
        return parzen
class MultiSoftmax(Layer): def __init__(self, n_groups, n_classes, layer_name, irange = None, istdev = None, sparse_init = None, W_lr_scale = None, b_lr_scale = None, max_row_norm = None, no_affine = False, max_col_norm = None): """ """ if isinstance(W_lr_scale, str): W_lr_scale = float(W_lr_scale) self.__dict__.update(locals()) del self.self assert isinstance(n_classes, py_integer_types) self.output_space = MatrixSpace(n_groups, n_classes) self.b = sharedX( np.zeros((n_groups, n_classes,)), name = 'softmax_b') def get_lr_scalers(self): rval = OrderedDict() if self.W_lr_scale is not None: assert isinstance(self.W_lr_scale, float) rval[self.W] = self.W_lr_scale if not hasattr(self, 'b_lr_scale'): self.b_lr_scale = None if self.b_lr_scale is not None: assert isinstance(self.b_lr_scale, float) rval[self.b] = self.b_lr_scale return rval def get_monitoring_channels(self): return OrderedDict() def get_monitoring_channels_from_state(self, state, target=None): return OrderedDict() def set_input_space(self, space): self.input_space = space if not isinstance(space, Space): raise TypeError("Expected Space, got "+ str(space)+" of type "+str(type(space))) self.input_dim = space.get_total_dimension() self.needs_reformat = not isinstance(space, VectorSpace) if self.no_affine: desired_dim = self.n_classes assert self.input_dim == desired_dim else: desired_dim = self.input_dim self.desired_space = VectorSpace(desired_dim) if not self.needs_reformat: assert self.desired_space == self.input_space rng = self.mlp.rng if self.irange is not None: assert self.istdev is None assert self.sparse_init is None W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_groups,self.n_classes)) elif self.istdev is not None: assert self.sparse_init is None W = rng.randn(self.input_dim,self.n_groups,self.n_classes) * self.istdev else: raise NotImplementedError() self.W = sharedX(W, 'softmax_W' ) self._params = [ self.b, self.W ] def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() desired = self.W.get_value().T ipt = self.desired_space.format_as(desired, self.input_space) rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) return rval def get_weights(self): if not isinstance(self.input_space, VectorSpace): raise NotImplementedError() return self.W.get_value() def set_weights(self, weights): self.W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 assert self.W.ndim == 3 Z = T.tensordot(state_below, self.W, axes=[[1],[0]]) + self.b rval = batched_softmax(Z) for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return rval def cost(self, Y, Y_hat): return self.cost_from_cost_matrix(self.cost_matrix(Y, Y_hat)) def cost_from_cost_matrix(self, cost_matrix): return cost_matrix.sum(axis=2).mean() def cost_matrix(self, Y, Y_hat): return -Y * T.log(Y_hat) def get_weight_decay(self, coeff): if 
isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') return coeff * T.sqr(self.W).sum() def get_l1_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') W = self.W return coeff * abs(W).sum() def censor_updates(self, updates): return if self.max_row_norm is not None: W = self.W if W in updates: updated_W = updates[W] row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1)) desired_norms = T.clip(row_norms, 0, self.max_row_norm) updates[W] = updated_W * (desired_norms / (1e-7 + row_norms)).dimshuffle(0, 'x') if self.max_col_norm is not None: assert self.max_row_norm is None W = self.W if W in updates: updated_W = updates[W] col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) desired_norms = T.clip(col_norms, 0, self.max_col_norm) updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
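cost_matrix above computes an element-wise -Y * log(Y_hat) over a (batch, n_groups, n_classes) tensor, and cost_from_cost_matrix sums over classes before averaging over the rest. A numpy sketch of the grouped softmax cross-entropy, assuming one-hot targets within each group:

import numpy as np

rng = np.random.RandomState(0)
batch, n_groups, n_classes = 4, 3, 5

Z = rng.randn(batch, n_groups, n_classes)
Z = Z - Z.max(axis=2, keepdims=True)
Y_hat = np.exp(Z) / np.exp(Z).sum(axis=2, keepdims=True)   # batched softmax

Y = np.zeros_like(Y_hat)                                   # one-hot per group
idx = rng.randint(n_classes, size=(batch, n_groups))
for b in range(batch):
    Y[b, np.arange(n_groups), idx[b]] = 1.0

cost_matrix = -Y * np.log(Y_hat)
cost = cost_matrix.sum(axis=2).mean()   # sum over classes, mean over the rest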
class BoltzmannIsingHidden(HiddenLayer): """ A hidden layer with h being a vector in {-1, 1}^dim, implementing the energy function term -v^T Wh -b^T h where W and b are parameters of this layer, and v is the upward state of the layer below """ def __init__(self, dim, layer_name, layer_below, irange = None, sparse_init = None, sparse_stdev = 1., include_prob = 1.0, init_bias = 0., W_lr_scale = None, b_lr_scale = None, max_col_norm = None, min_ising_b = None, max_ising_b = None, min_ising_W = None, max_ising_W = None, sampling_W_stdev = None, sampling_b_stdev = None): """ include_prob: probability of including a weight element in the set of weights initialized to U(-irange, irange). If not included it is initialized to 0. """ self.__dict__.update(locals()) del self.self self.boltzmann_b = sharedX( np.zeros((self.dim,)) + init_bias, name = layer_name + '_b') layer_below.layer_above = self def get_lr_scalers(self): if not hasattr(self, 'W_lr_scale'): self.W_lr_scale = None if not hasattr(self, 'b_lr_scale'): self.b_lr_scale = None rval = OrderedDict() if self.W_lr_scale is not None: W = self.W rval[W] = self.W_lr_scale if self.b_lr_scale is not None: rval[self.boltzmann_b] = self.b_lr_scale return rval def set_input_space(self, space): """ Note: this resets parameters! """ self.input_space = space if isinstance(space, VectorSpace): self.requires_reformat = False self.input_dim = space.dim else: self.requires_reformat = True self.input_dim = space.get_total_dimension() self.desired_space = VectorSpace(self.input_dim) self.output_space = VectorSpace(self.dim) rng = self.dbm.rng if self.irange is not None: assert self.sparse_init is None W = rng.uniform(-self.irange, self.irange, (self.input_dim, self.dim)) * \ (rng.uniform(0.,1., (self.input_dim, self.dim)) < self.include_prob) else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.dim)) W *= self.sparse_stdev W = sharedX(W) W.name = self.layer_name + '_W' self.W = W if self.sampling_b_stdev is not None: self.noisy_sampling_b = sharedX(np.zeros((self.dbm.batch_size, self.dim))) self.layer_below.noisy_sampling_b = sharedX(np.zeros((self.dbm.batch_size, self.layer_below.nvis))) if self.sampling_W_stdev is not None: self.noisy_sampling_W = sharedX(np.zeros((self.input_dim, self.dim)), 'noisy_sampling_W') updates = OrderedDict() updates[self.boltzmann_b] = self.boltzmann_b updates[self.W] = self.W updates[self.layer_below.boltzmann_bias] = self.layer_below.boltzmann_bias self.censor_updates(updates) f = function([], updates=updates) f() def censor_updates(self, updates): if self.max_col_norm is not None: W = self.W if W in updates: updated_W = updates[W] col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) desired_norms = T.clip(col_norms, 0, self.max_col_norm) updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) if any(constraint is not None for constraint in [self.min_ising_b, self.max_ising_b, self.min_ising_W, self.max_ising_W]): assert not hasattr(self.layer_below, 'layer_below') bmn = self.min_ising_b if bmn is None: bmn = - 1e6 bmx = self.max_ising_b if bmx is None: bmx = 1e6 wmn = self.min_ising_W if wmn is None: wmn = - 1e6 wmx = self.max_ising_W if wmx is None: wmx = 1e6 W = updates[self.W] ising_W = 0.25 * W ising_W = T.clip(ising_W, wmn, wmx) bv = updates[self.layer_below.boltzmann_bias] ising_bv = 0.5 * bv + 0.25 * W.sum(axis=1) ising_bv = T.clip(ising_bv, bmn, bmx) bh = updates[self.boltzmann_b] ising_bh = 0.5 * bh + 0.25 * W.sum(axis=0) ising_bh = T.clip(ising_bh, bmn, bmx) Wn = 4. * ising_W bvn = 2. 
* (ising_bv - ising_W.sum(axis=1)) bhn = 2. * (ising_bh - ising_W.sum(axis=0)) updates[self.W] = Wn updates[self.layer_below.boltzmann_bias] = bvn updates[self.boltzmann_b] = bhn if self.noisy_sampling_W is not None: theano_rng = MRG_RandomStreams(self.dbm.rng.randint(2**16)) bmn = self.min_ising_b if bmn is None: bmn = - 1e6 bmx = self.max_ising_b if bmx is None: bmx = 1e6 wmn = self.min_ising_W if wmn is None: wmn = - 1e6 wmx = self.max_ising_W if wmx is None: wmx = 1e6 W = updates[self.W] ising_W = 0.25 * W noisy_sampling_W = theano_rng.normal(avg=ising_W, std=self.sampling_W_stdev, size=ising_W.shape, dtype=ising_W.dtype) updates[self.noisy_sampling_W] = noisy_sampling_W bv = updates[self.layer_below.boltzmann_bias] ising_bv = 0.5 * bv + 0.25 * W.sum(axis=1) noisy_sampling_bv = theano_rng.normal(avg=ising_bv.dimshuffle('x', 0), std=self.sampling_b_stdev, size=self.layer_below.noisy_sampling_b.shape, dtype=ising_bv.dtype) updates[self.layer_below.noisy_sampling_b] = noisy_sampling_bv bh = updates[self.boltzmann_b] ising_bh = 0.5 * bh + 0.25 * W.sum(axis=0) noisy_sampling_bh = theano_rng.normal(avg=ising_bh.dimshuffle('x', 0), std=self.sampling_b_stdev, size = self.noisy_sampling_b.shape, dtype=ising_bh.dtype) updates[self.noisy_sampling_b] = noisy_sampling_bh def get_total_state_space(self): return VectorSpace(self.dim) def get_params(self): assert self.boltzmann_b.name is not None W = self.W assert W.name is not None rval = [W] assert not isinstance(rval, set) rval = list(rval) assert self.boltzmann_b not in rval rval.append(self.boltzmann_b) return rval def ising_weights(self, for_sampling=False): if not hasattr(self, 'sampling_W_stdev'): self.sampling_W_stdev = None if for_sampling and self.sampling_W_stdev is not None: return self.noisy_sampling_W return 0.25 * self.W def ising_b(self, for_sampling=False): if hasattr(self, 'layer_above'): raise NotImplementedError() if not hasattr(self, 'sampling_b_stdev'): self.sampling_b_stdev = None if for_sampling and self.sampling_b_stdev is not None: return self.noisy_sampling_b return 0.5 * self.boltzmann_b + 0.25 * self.W.sum(axis=0) def ising_b_numpy(self): if hasattr(self, 'layer_above'): raise NotImplementedError() return 0.5 * self.boltzmann_b.get_value() + 0.25 * self.W.get_value().sum(axis=0) def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') W = self.W return coeff * T.sqr(W).sum() def get_weights(self): warnings.warn("BoltzmannIsingHidden.get_weights returns the BOLTZMANN weights, is that what we want?") W = self.W return W.get_value() def set_weights(self, weights): warnings.warn("BoltzmannIsingHidden.set_weights sets the BOLTZMANN weights, is that what we want?") W = self.W W.set_value(weights) def set_biases(self, biases, recenter = False): assert False # not really sure what this should do def get_biases(self): assert False # not really sure what this should do def get_weights_format(self): return ('v', 'h') def get_weights_topo(self): warnings.warn("BoltzmannIsingHidden.get_weights_topo returns the BOLTZMANN weights, is that what we want?") if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() W = self.W W = W.T W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], self.input_space.shape[1], self.input_space.nchannels)) W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) return function([], W)() def upward_state(self, total_state): return total_state def downward_state(self, 
total_state): return total_state def get_monitoring_channels(self): W = self.W assert W.ndim == 2 sq_W = T.sqr(W) row_norms = T.sqrt(sq_W.sum(axis=1)) col_norms = T.sqrt(sq_W.sum(axis=0)) rval = OrderedDict([ ('boltzmann_row_norms_min' , row_norms.min()), ('boltzmann_row_norms_mean' , row_norms.mean()), ('boltzmann_row_norms_max' , row_norms.max()), ('boltzmann_col_norms_min' , col_norms.min()), ('boltzmann_col_norms_mean' , col_norms.mean()), ('boltzmann_col_norms_max' , col_norms.max()), ]) ising_W = self.ising_weights() rval['ising_W_min'] = ising_W.min() rval['ising_W_max'] = ising_W.max() ising_b = self.ising_b() rval['ising_b_min'] = ising_b.min() rval['ising_b_max'] = ising_b.max() if hasattr(self, 'noisy_sampling_W'): rval['noisy_sampling_W_min'] = self.noisy_sampling_W.min() rval['noisy_sampling_W_max'] = self.noisy_sampling_W.max() rval['noisy_sampling_b_min'] = self.noisy_sampling_b.min() rval['noisy_sampling_b_max'] = self.noisy_sampling_b.max() return rval def get_monitoring_channels_from_state(self, state): P = state rval = OrderedDict() vars_and_prefixes = [ (P,'') ] for var, prefix in vars_and_prefixes: v_max = var.max(axis=0) v_min = var.min(axis=0) v_mean = var.mean(axis=0) v_range = v_max - v_min # max_x.mean_u is "the mean over *u*nits of the max over e*x*amples" # The x and u are included in the name because otherwise its hard # to remember which axis is which when reading the monitor # I use inner.outer rather than outer_of_inner or something like that # because I want mean_x.* to appear next to each other in the alphabetical # list, as these are commonly plotted together for key, val in [ ('max_x.max_u', v_max.max()), ('max_x.mean_u', v_max.mean()), ('max_x.min_u', v_max.min()), ('min_x.max_u', v_min.max()), ('min_x.mean_u', v_min.mean()), ('min_x.min_u', v_min.min()), ('range_x.max_u', v_range.max()), ('range_x.mean_u', v_range.mean()), ('range_x.min_u', v_range.min()), ('mean_x.max_u', v_mean.max()), ('mean_x.mean_u', v_mean.mean()), ('mean_x.min_u', v_mean.min()) ]: rval[prefix+key] = val return rval def sample(self, state_below = None, state_above = None, layer_above = None, theano_rng = None): if theano_rng is None: raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") if state_above is not None: msg = layer_above.downward_message(state_above, for_sampling=True) else: msg = None if self.requires_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) z = T.dot(state_below, self.ising_weights(for_sampling=True)) + self.ising_b(for_sampling=True) if msg != None: z = z + msg on_prob = T.nnet.sigmoid(2. * z) samples = theano_rng.binomial(p = on_prob, n=1, size=on_prob.shape, dtype=on_prob.dtype) * 2. - 1. return samples def downward_message(self, downward_state, for_sampling=False): rval = T.dot(downward_state, self.ising_weights(for_sampling=False).T) if self.requires_reformat: rval = self.desired_space.format_as(rval, self.input_space) return rval def init_mf_state(self): raise NotImplementedError("This is just a copy-paste of BVMP") # work around theano bug with broadcasted vectors z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.boltzmann_b.dtype) + \ self.ising_b().dimshuffle('x', 0) rval = max_pool_channels(z = z, pool_size = self.pool_size) return rval def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. 
""" driver = numpy_rng.uniform(0.,1., (num_examples, self.dim)) on_prob = sigmoid_numpy(2. * self.ising_b_numpy()) sample = 2. * (driver < on_prob) - 1. rval = sharedX(sample, name = 'v_sample_shared') return rval def make_symbolic_state(self, num_examples, theano_rng): mean = T.nnet.sigmoid(2. * self.ising_b()) rval = theano_rng.binomial(size=(num_examples, self.nvis), p=mean) rval = 2. * (rval) - 1. return rval def expected_energy_term(self, state, average, state_below, average_below): # state = Print('h_state', attrs=['min', 'max'])(state) self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(state, self.ising_b()) weights_term = (T.dot(state_below, self.ising_weights()) * state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval def linear_feed_forward_approximation(self, state_below): """ Used to implement TorontoSparsity. Unclear exactly what properties of it are important or how to implement it for other layers. Properties it must have: output is same kind of data structure (ie, tuple of theano 2-tensors) as mf_update Properties it probably should have for other layer types: An infinitesimal change in state_below or the parameters should cause the same sign of change in the output of linear_feed_forward_approximation and in mf_update Should not have any non-linearities that cause the gradient to shrink Should disregard top-down feedback """ z = T.dot(state_below, self.ising_weights()) + self.ising_b() return z def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) if iter_name is None: iter_name = 'anon' if state_above is not None: assert layer_above is not None msg = layer_above.downward_message(state_above) msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' else: msg = None if double_weights: state_below = 2. * state_below state_below.name = self.layer_name + '_'+iter_name + '_2state' z = T.dot(state_below, self.ising_weights()) + self.ising_b() if self.layer_name is not None and iter_name is not None: z.name = self.layer_name + '_' + iter_name + '_z' if msg is not None: z = z + msg h = T.tanh(z) return h def get_l2_act_cost(self, state, target, coeff): avg = state.mean(axis=0) diff = avg - target return coeff * T.sqr(diff).mean()
class BinaryVectorMaxPool(HiddenLayer): """ A hidden layer that does max-pooling on binary vectors. It has two sublayers, the detector layer and the pooling layer. The detector layer is its downward state and the pooling layer is its upward state. TODO: this layer uses (pooled, detector) as its total state, which can be confusing when listing all the states in the network left to right. Change this and pylearn2.expr.probabilistic_max_pooling to use (detector, pooled) """ def __init__(self, detector_layer_dim, pool_size, layer_name, irange = None, sparse_init = None, include_prob = 1.0, init_bias = 0.): """ include_prob: probability of including a weight element in the set of weights initialized to U(-irange, irange). If not included it is initialized to 0. """ self.__dict__.update(locals()) del self.self self.b = sharedX( np.zeros((self.detector_layer_dim,)) + init_bias, name = layer_name + '_b') def set_input_space(self, space): """ Note: this resets parameters! """ self.input_space = space if isinstance(space, VectorSpace): self.requires_reformat = False self.input_dim = space.dim else: self.requires_reformat = True self.input_dim = space.get_total_dimension() self.desired_space = VectorSpace(self.input_dim) if not (self.detector_layer_dim % self.pool_size == 0): raise ValueError("detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" % (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size)) self.h_space = VectorSpace(self.detector_layer_dim) self.pool_layer_dim = self.detector_layer_dim / self.pool_size self.output_space = VectorSpace(self.pool_layer_dim) rng = self.dbm.rng if self.irange is not None: assert self.sparse_init is None W = rng.uniform(-self.irange, self.irange, (self.input_dim, self.detector_layer_dim)) * \ (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim)) < self.include_prob) else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.detector_layer_dim)) for i in xrange(self.detector_layer_dim): for j in xrange(self.sparse_init): idx = rng.randint(0, self.input_dim) while W[idx, i] != 0: idx = rng.randint(0, self.input_dim) W[idx, i] = rng.randn() W = sharedX(W) W.name = self.layer_name + '_W' self.transformer = MatrixMul(W) W ,= self.transformer.get_params() assert W.name is not None def get_total_state_space(self): return CompositeSpace((self.output_space, self.h_space)) def get_params(self): assert self.b.name is not None W ,= self.transformer.get_params() assert W.name is not None return self.transformer.get_params().union([self.b]) def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) W ,= self.transformer.get_params() return coeff * T.sqr(W).sum() def get_weights(self): if self.requires_reformat: # This is not really an unimplemented case. # We actually don't know how to format the weights # in design space. 
We got the data in topo space # and we don't have access to the dataset raise NotImplementedError() W ,= self.transformer.get_params() return W.get_value() def set_weights(self, weights): W, = self.transformer.get_params() W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def get_weights_view_shape(self): total = self.detector_layer_dim cols = self.pool_size if cols == 1: # Let the PatchViewer decidew how to arrange the units # when they're not pooled raise NotImplementedError() # When they are pooled, make each pooling unit have one row rows = total / cols return rows, cols def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() W ,= self.transformer.get_params() W = W.T W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], self.input_space.shape[1], self.input_space.nchannels)) W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) return function([], W)() def upward_state(self, total_state): p,h = total_state self.h_space.validate(h) self.output_space.validate(p) return p def downward_state(self, total_state): p,h = total_state return h def get_monitoring_channels_from_state(self, state): P, H = state rval ={} if self.pool_size == 1: vars_and_prefixes = [ (P,'') ] else: vars_and_prefixes = [ (P, 'p_'), (H, 'h_') ] for var, prefix in vars_and_prefixes: v_max = var.max(axis=0) v_min = var.min(axis=0) v_mean = var.mean(axis=0) v_range = v_max - v_min for key, val in [ ('max_max', v_max.max()), ('max_mean', v_max.mean()), ('max_min', v_max.min()), ('min_max', v_min.max()), ('min_mean', v_min.mean()), ('min_max', v_min.max()), ('range_max', v_range.max()), ('range_mean', v_range.mean()), ('range_min', v_range.min()), ('mean_max', v_mean.max()), ('mean_mean', v_mean.mean()), ('mean_min', v_mean.min()) ]: rval[prefix+key] = val return rval def get_l1_act_cost(self, state, target, coeff, eps = None): rval = 0. P, H = state self.output_space.validate(P) self.h_space.validate(H) if self.pool_size == 1: # If the pool size is 1 then pools = detectors # and we should not penalize pools and detectors separately assert len(state) == 2 assert isinstance(target, float) assert isinstance(coeff, float) _, state = state state = [state] target = [target] coeff = [coeff] if eps is None: eps = [0.] else: eps = [eps] else: assert all([len(elem) == 2 for elem in [state, target, coeff]]) if eps is None: eps = [0., 0.] 
if target[1] < target[0]: warnings.warn("Do you really want to regularize the detector units to be sparser than the pooling units?") for s, t, c, e in safe_zip(state, target, coeff, eps): assert all([isinstance(elem, float) for elem in [t, c, e]]) if c == 0.: continue m = s.mean(axis=0) assert m.ndim == 1 rval += T.maximum(abs(m-t)-e,0.).mean()*c return rval def sample(self, state_below = None, state_above = None, layer_above = None, theano_rng = None): if theano_rng is None: raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") if state_above is not None: msg = layer_above.downward_message(state_above) else: msg = None if self.requires_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) z = self.transformer.lmul(state_below) + self.b p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size, msg, theano_rng) return p_sample, h_sample def downward_message(self, downward_state): rval = self.transformer.lmul_T(downward_state) if self.requires_reformat: rval = self.desired_space.format_as(rval, self.input_space) return rval def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. """ t1 = time.time() empty_input = self.h_space.get_origin_batch(num_examples) h_state = sharedX(empty_input) default_z = T.zeros_like(h_state) + self.b theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16)) p_exp, h_exp, p_sample, h_sample = max_pool_channels( z = default_z, pool_size = self.pool_size, theano_rng = theano_rng) assert h_sample.dtype == default_z.dtype p_state = sharedX( self.output_space.get_origin_batch( num_examples)) t2 = time.time() f = function([], updates = { p_state : p_sample, h_state : h_sample }) t3 = time.time() f() t4 = time.time() print str(self)+'.make_state took',t4-t1 print '\tcompose time:',t2-t1 print '\tcompile time:',t3-t2 print '\texecute time:',t4-t3 p_state.name = 'p_sample_shared' h_state.name = 'h_sample_shared' return p_state, h_state def expected_energy_term(self, state, average, state_below, average_below): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) downward_state = self.downward_state(state) self.h_space.validate(downward_state) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(downward_state, self.b) weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below 
= self.input_space.format_as(state_below, self.desired_space) if iter_name is None: iter_name = 'anon' if state_above is not None: assert layer_above is not None msg = layer_above.downward_message(state_above) msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' else: msg = None if double_weights: state_below = 2. * state_below state_below.name = self.layer_name + '_'+iter_name + '_2state' z = self.transformer.lmul(state_below) + self.b if self.layer_name is not None and iter_name is not None: z.name = self.layer_name + '_' + iter_name + '_z' p,h = max_pool_channels(z, self.pool_size, msg) p.name = self.layer_name + '_p_' + iter_name h.name = self.layer_name + '_h_' + iter_name return p, h
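max_pool_channels lets the pool_size detector units in each group compete with an implicit "off" unit (pre-activation 0) through a softmax. A hedged numpy sketch of the mean-field version of that pooling; this mirrors the idea, not the exact pylearn2 implementation:

import numpy as np

def max_pool_channels_np(z, pool_size):
    batch, dim = z.shape
    zp = z.reshape(batch, dim // pool_size, pool_size)
    # append the "off" pre-activation and normalise stably within each pool
    stacked = np.concatenate([zp, np.zeros(zp.shape[:2] + (1,))], axis=2)
    stacked = stacked - stacked.max(axis=2, keepdims=True)
    e = np.exp(stacked)
    probs = e / e.sum(axis=2, keepdims=True)
    h = probs[:, :, :pool_size].reshape(batch, dim)   # detector probabilities
    p = 1.0 - probs[:, :, pool_size]                  # P(pool unit is on)
    return p, h

p, h = max_pool_channels_np(np.random.RandomState(0).randn(3, 8), pool_size=2)
assert np.allclose(h.reshape(3, 4, 2).sum(axis=2), p)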
class IsingHidden(HiddenLayer): """ A hidden layer with h being a vector in {-1, 1}^dim, implementing the energy function term -v^T Wh -b^T h where W and b are parameters of this layer, and v is the upward state of the layer below """ def __init__(self, dim, layer_name, irange = None, sparse_init = None, sparse_stdev = 1., include_prob = 1.0, init_bias = 0., W_lr_scale = None, b_lr_scale = None, max_col_norm = None): """ include_prob: probability of including a weight element in the set of weights initialized to U(-irange, irange). If not included it is initialized to 0. """ self.__dict__.update(locals()) del self.self self.b = sharedX( np.zeros((self.dim,)) + init_bias, name = layer_name + '_b') def get_lr_scalers(self): if not hasattr(self, 'W_lr_scale'): self.W_lr_scale = None if not hasattr(self, 'b_lr_scale'): self.b_lr_scale = None rval = OrderedDict() if self.W_lr_scale is not None: W, = self.transformer.get_params() rval[W] = self.W_lr_scale if self.b_lr_scale is not None: rval[self.b] = self.b_lr_scale return rval def set_input_space(self, space): """ Note: this resets parameters! """ self.input_space = space if isinstance(space, VectorSpace): self.requires_reformat = False self.input_dim = space.dim else: self.requires_reformat = True self.input_dim = space.get_total_dimension() self.desired_space = VectorSpace(self.input_dim) self.output_space = VectorSpace(self.dim) rng = self.dbm.rng if self.irange is not None: assert self.sparse_init is None W = rng.uniform(-self.irange, self.irange, (self.input_dim, self.dim)) * \ (rng.uniform(0.,1., (self.input_dim, self.dim)) < self.include_prob) else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.dim)) W *= self.sparse_stdev W = sharedX(W) W.name = self.layer_name + '_W' self.transformer = MatrixMul(W) W ,= self.transformer.get_params() assert W.name is not None def censor_updates(self, updates): if self.max_col_norm is not None: W, = self.transformer.get_params() if W in updates: updated_W = updates[W] col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) desired_norms = T.clip(col_norms, 0, self.max_col_norm) updates[W] = updated_W * (desired_norms / (1e-7 + col_norms)) def get_total_state_space(self): return VectorSpace(self.dim) def get_params(self): assert self.b.name is not None W ,= self.transformer.get_params() assert W.name is not None rval = self.transformer.get_params() assert not isinstance(rval, set) rval = list(rval) assert self.b not in rval rval.append(self.b) return rval def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') W ,= self.transformer.get_params() return coeff * T.sqr(W).sum() def get_weights(self): if self.requires_reformat: # This is not really an unimplemented case. # We actually don't know how to format the weights # in design space. 
We got the data in topo space # and we don't have access to the dataset raise NotImplementedError() W ,= self.transformer.get_params() return W.get_value() def set_weights(self, weights): W, = self.transformer.get_params() W.set_value(weights) def set_biases(self, biases, recenter = False): self.b.set_value(biases) if recenter: assert self.center if self.pool_size != 1: raise NotImplementedError() self.offset.set_value(sigmoid_numpy(self.b.get_value())) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() W ,= self.transformer.get_params() W = W.T W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], self.input_space.shape[1], self.input_space.nchannels)) W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) return function([], W)() def upward_state(self, total_state): return total_state def downward_state(self, total_state): return total_state def get_monitoring_channels(self): W ,= self.transformer.get_params() assert W.ndim == 2 sq_W = T.sqr(W) row_norms = T.sqrt(sq_W.sum(axis=1)) col_norms = T.sqrt(sq_W.sum(axis=0)) return OrderedDict([ ('row_norms_min' , row_norms.min()), ('row_norms_mean' , row_norms.mean()), ('row_norms_max' , row_norms.max()), ('col_norms_min' , col_norms.min()), ('col_norms_mean' , col_norms.mean()), ('col_norms_max' , col_norms.max()), ]) def get_monitoring_channels_from_state(self, state): P = state rval = OrderedDict() vars_and_prefixes = [ (P,'') ] for var, prefix in vars_and_prefixes: v_max = var.max(axis=0) v_min = var.min(axis=0) v_mean = var.mean(axis=0) v_range = v_max - v_min # max_x.mean_u is "the mean over *u*nits of the max over e*x*amples" # The x and u are included in the name because otherwise its hard # to remember which axis is which when reading the monitor # I use inner.outer rather than outer_of_inner or something like that # because I want mean_x.* to appear next to each other in the alphabetical # list, as these are commonly plotted together for key, val in [ ('max_x.max_u', v_max.max()), ('max_x.mean_u', v_max.mean()), ('max_x.min_u', v_max.min()), ('min_x.max_u', v_min.max()), ('min_x.mean_u', v_min.mean()), ('min_x.min_u', v_min.min()), ('range_x.max_u', v_range.max()), ('range_x.mean_u', v_range.mean()), ('range_x.min_u', v_range.min()), ('mean_x.max_u', v_mean.max()), ('mean_x.mean_u', v_mean.mean()), ('mean_x.min_u', v_mean.min()) ]: rval[prefix+key] = val return rval def sample(self, state_below = None, state_above = None, layer_above = None, theano_rng = None): if theano_rng is None: raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") if state_above is not None: msg = layer_above.downward_message(state_above) else: msg = None if self.requires_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) z = self.transformer.lmul(state_below) + self.b if msg != None: z = z + msg on_prob = T.nnet.sigmoid(2. * z) samples = theano_rng.binomial(p = on_prob, n=1, size=on_prob.shape, dtype=on_prob.dtype) * 2. - 1. 
return samples def downward_message(self, downward_state): rval = self.transformer.lmul_T(downward_state) if self.requires_reformat: rval = self.desired_space.format_as(rval, self.input_space) return rval def init_mf_state(self): raise NotImplementedError("This is just a copy-paste of BVMP") # work around theano bug with broadcasted vectors z = T.alloc(0., self.dbm.batch_size, self.detector_layer_dim).astype(self.b.dtype) + \ self.b.dimshuffle('x', 0) rval = max_pool_channels(z = z, pool_size = self.pool_size) return rval def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. """ driver = numpy_rng.uniform(0.,1., (num_examples, self.dim)) on_prob = sigmoid_numpy(2. * self.b.get_value()) sample = 2. * (driver < on_prob) - 1. rval = sharedX(sample, name = 'v_sample_shared') return rval def expected_energy_term(self, state, average, state_below, average_below): # state = Print('h_state', attrs=['min', 'max'])(state) self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(state, self.b) weights_term = (self.transformer.lmul(state_below) * state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval def linear_feed_forward_approximation(self, state_below): """ Used to implement TorontoSparsity. Unclear exactly what properties of it are important or how to implement it for other layers. Properties it must have: output is same kind of data structure (ie, tuple of theano 2-tensors) as mf_update Properties it probably should have for other layer types: An infinitesimal change in state_below or the parameters should cause the same sign of change in the output of linear_feed_forward_approximation and in mf_update Should not have any non-linearities that cause the gradient to shrink Should disregard top-down feedback """ z = self.transformer.lmul(state_below) + self.b if self.pool_size != 1: # Should probably implement sum pooling for the non-pooled version, # but in reality it's not totally clear what the right answer is raise NotImplementedError() return z, z def mf_update(self, state_below, state_above, layer_above = None, double_weights = False, iter_name = None): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError("self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x,y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) if iter_name is None: iter_name = 'anon' if state_above is not None: assert layer_above is not None msg = layer_above.downward_message(state_above) msg.name = 'msg_from_'+layer_above.layer_name+'_to_'+self.layer_name+'['+iter_name+']' else: msg = None if double_weights: state_below = 2. 
* state_below state_below.name = self.layer_name + '_'+iter_name + '_2state' z = self.transformer.lmul(state_below) + self.b if self.layer_name is not None and iter_name is not None: z.name = self.layer_name + '_' + iter_name + '_z' if msg is not None: z = z + msg h = T.tanh(z) return h
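The sampling and mean-field expressions in the layer above are consistent for units taking values in {-1, +1}: with P(s = +1) = sigmoid(2z), the expectation is E[s] = 2*sigmoid(2z) - 1 = tanh(z), which is exactly what mf_update returns. A minimal numpy check (sigmoid_numpy is redefined locally here just for the check):

import numpy as np

def sigmoid_numpy(x):
    return 1. / (1. + np.exp(-x))

z = np.linspace(-3., 3., 13)
expected_state = 2. * sigmoid_numpy(2. * z) - 1.  # E[s] for s in {-1, +1}
assert np.allclose(expected_state, np.tanh(z))    # matches the mf_update output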
class ToyRNNPhone(Model): """ WRITEME """ def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05, non_linearity='sigmoid', use_ground_truth=True): allowed_non_linearities = {'sigmoid': T.nnet.sigmoid, 'tanh': T.tanh} self.nvis = nvis self.nhid = nhid self.hidden_transition_model = hidden_transition_model self.use_ground_truth = use_ground_truth self.alpha = sharedX(1) self.alpha_decrease_rate = 0.999 assert non_linearity in allowed_non_linearities self.non_linearity = allowed_non_linearities[non_linearity] # Space initialization self.input_space = VectorSpace(dim=self.nvis) self.hidden_space = VectorSpace(dim=self.nhid) self.output_space = VectorSpace(dim=1) self.input_source = 'features' self.target_source = 'targets' # Features-to-hidden matrix W_value = numpy.random.uniform(low=-irange, high=irange, size=(self.nvis, self.nhid)) self.W = sharedX(W_value, name='W') # Hidden biases b_value = numpy.zeros(self.nhid) self.b = sharedX(b_value, name='b') # Hidden-to-out matrix U_value = numpy.random.uniform(low=-irange, high=irange, size=(self.nhid, 1)) self.U = sharedX(U_value, name='U') # Output bias c_value = numpy.zeros(1) self.c = sharedX(c_value, name='c') def fprop_step(self, features, h_tm1, out): h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space) h = T.nnet.sigmoid(T.dot(features, self.W) + self.hidden_transition_model.fprop(h_tm1).flatten() + self.b) out = T.dot(h, self.U) + self.c return h, out def fprop_step_prime(self, truth, features, h_tm1, out): features = T.set_subtensor(features[-1], (1 - self.alpha) * features[-1] + self.alpha * truth[-1]) h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space) h = T.nnet.sigmoid(T.dot(features, self.W) + self.hidden_transition_model.fprop(h_tm1).flatten() + self.b) out = T.dot(h, self.U) + self.c features = T.concatenate([features[1:], out]) return features, h, out def fprop(self, data): if self.use_ground_truth: self.input_space.validate(data) features = data init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid) init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1) init_out = T.unbroadcast(init_out, 0) fn = lambda f, h, o: self.fprop_step(f, h, o) ((h, out), updates) = theano.scan(fn=fn, sequences=[features], outputs_info=[dict(initial=init_h, taps=[-1]), init_out]) return out else: self.input_space.validate(data) features = data init_in = features[0] init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid) init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1) init_out = T.unbroadcast(init_out, 0) fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o) ((f, h, out), updates) = theano.scan(fn=fn, sequences=[features], outputs_info=[init_in, dict(initial=init_h, taps=[-1]), init_out]) return out def predict_next(self, features, h_tm1): h_tm1 = self.hidden_space.format_as(h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space) h = T.nnet.sigmoid(T.dot(features, self.W) + self.hidden_transition_model.fprop(h_tm1).flatten() + self.b) out = T.dot(h, self.U) + self.c return h, out def get_params(self): return [self.W, self.b, self.U, self.c] + \ self.hidden_transition_model.get_params() def get_input_source(self): return self.input_source def get_target_source(self): return self.target_source def censor_updates(self, updates): updates[self.alpha] = self.alpha_decrease_rate * self.alpha def get_monitoring_channels(self, data): rval = OrderedDict() rval['alpha'] = self.alpha return rval
class ToyRNNPhone(Model): """ WRITEME """ def __init__(self, nvis, nhid, hidden_transition_model, irange=0.05, non_linearity='sigmoid', use_ground_truth=True): allowed_non_linearities = {'sigmoid': T.nnet.sigmoid, 'tanh': T.tanh} self.nvis = nvis self.nhid = nhid self.hidden_transition_model = hidden_transition_model self.use_ground_truth = use_ground_truth self.alpha = sharedX(1) self.alpha_decrease_rate = 0.999 assert non_linearity in allowed_non_linearities self.non_linearity = allowed_non_linearities[non_linearity] # Space initialization self.input_space = VectorSpace(dim=self.nvis) self.hidden_space = VectorSpace(dim=self.nhid) self.output_space = VectorSpace(dim=1) self.input_source = 'features' self.target_source = 'targets' # Features-to-hidden matrix W_value = numpy.random.uniform(low=-irange, high=irange, size=(self.nvis, self.nhid)) self.W = sharedX(W_value, name='W') # Hidden biases b_value = numpy.zeros(self.nhid) self.b = sharedX(b_value, name='b') # Hidden-to-out matrix U_value = numpy.random.uniform(low=-irange, high=irange, size=(self.nhid, 1)) self.U = sharedX(U_value, name='U') # Output bias c_value = numpy.zeros(1) self.c = sharedX(c_value, name='c') def fprop_step(self, features, h_tm1, out): h_tm1 = self.hidden_space.format_as( h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space) h = T.nnet.sigmoid( T.dot(features, self.W) + self.hidden_transition_model.fprop(h_tm1).flatten() + self.b) out = T.dot(h, self.U) + self.c return h, out def fprop_step_prime(self, truth, features, h_tm1, out): features = T.set_subtensor(features[-1], (1 - self.alpha) * features[-1] + self.alpha * truth[-1]) h_tm1 = self.hidden_space.format_as( h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space) h = T.nnet.sigmoid( T.dot(features, self.W) + self.hidden_transition_model.fprop(h_tm1).flatten() + self.b) out = T.dot(h, self.U) + self.c features = T.concatenate([features[1:], out]) return features, h, out def fprop(self, data): if self.use_ground_truth: self.input_space.validate(data) features = data init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid) init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1) init_out = T.unbroadcast(init_out, 0) fn = lambda f, h, o: self.fprop_step(f, h, o) ((h, out), updates) = theano.scan( fn=fn, sequences=[features], outputs_info=[dict(initial=init_h, taps=[-1]), init_out]) return out else: self.input_space.validate(data) features = data init_in = features[0] init_h = T.alloc(numpy.cast[theano.config.floatX](0), self.nhid) init_out = T.alloc(numpy.cast[theano.config.floatX](0), 1) init_out = T.unbroadcast(init_out, 0) fn = lambda t, f, h, o: self.fprop_step_prime(t, f, h, o) ((f, h, out), updates) = theano.scan(fn=fn, sequences=[features], outputs_info=[ init_in, dict(initial=init_h, taps=[-1]), init_out ]) return out def predict_next(self, features, h_tm1): h_tm1 = self.hidden_space.format_as( h_tm1.dimshuffle('x', 0), self.hidden_transition_model.input_space) h = T.nnet.sigmoid( T.dot(features, self.W) + self.hidden_transition_model.fprop(h_tm1).flatten() + self.b) out = T.dot(h, self.U) + self.c return h, out def get_params(self): return [self.W, self.b, self.U, self.c] + \ self.hidden_transition_model.get_params() def get_input_source(self): return self.input_source def get_target_source(self): return self.target_source def censor_updates(self, updates): updates[self.alpha] = self.alpha_decrease_rate * self.alpha def get_monitoring_channels(self, data): rval = OrderedDict() rval['alpha'] = self.alpha return rval
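The scan in fprop implements the recurrence h_t = sigmoid(x_t W + g(h_{t-1}) + b) and out_t = h_t U + c, where g is whatever hidden_transition_model computes. The rough numpy sketch below unrolls the ground-truth branch, with a plain linear transition standing in for hidden_transition_model and made-up sizes:

import numpy as np

rng = np.random.RandomState(0)
n_steps, nvis, nhid = 6, 3, 4
W = rng.uniform(-0.05, 0.05, (nvis, nhid))
V = rng.uniform(-0.05, 0.05, (nhid, nhid))  # stand-in for hidden_transition_model
U = rng.uniform(-0.05, 0.05, (nhid, 1))
b = np.zeros(nhid)
c = np.zeros(1)

features = rng.randn(n_steps, nvis)
h = np.zeros(nhid)
outputs = []
for x_t in features:
    h = 1. / (1. + np.exp(-(np.dot(x_t, W) + np.dot(h, V) + b)))
    outputs.append(np.dot(h, U) + c)
outputs = np.array(outputs)  # shape (n_steps, 1), matching the scan output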
def simulate(inputs, model):
    space = VectorSpace(inputs.shape[1])
    X = space.make_theano_batch()
    Y = model.fprop(space.format_as(X, model.get_input_space()))
    f = theano.function([X], Y)
    return f(inputs)
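Hypothetical usage of simulate; the model path is a placeholder and the inputs are random, but the shape constraint is the real one: inputs must be a 2-D design matrix whose second dimension equals the total dimension of the model's input space.

import numpy as np
from pylearn2.utils import serial

model = serial.load('model.pkl')  # placeholder path
dim = model.get_input_space().get_total_dimension()
inputs = np.random.randn(10, dim).astype('float32')
outputs = simulate(inputs, model)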
class MultiSoftmax(Layer): def __init__(self, n_groups, n_classes, layer_name, irange=None, istdev=None, sparse_init=None, W_lr_scale=None, b_lr_scale=None, max_row_norm=None, no_affine=False, max_col_norm=None): """ """ if isinstance(W_lr_scale, str): W_lr_scale = float(W_lr_scale) self.__dict__.update(locals()) del self.self assert isinstance(n_classes, py_integer_types) self.output_space = MatrixSpace(n_groups, n_classes) self.b = sharedX(np.zeros(( n_groups, n_classes, )), name='softmax_b') def get_lr_scalers(self): rval = OrderedDict() if self.W_lr_scale is not None: assert isinstance(self.W_lr_scale, float) rval[self.W] = self.W_lr_scale if not hasattr(self, 'b_lr_scale'): self.b_lr_scale = None if self.b_lr_scale is not None: assert isinstance(self.b_lr_scale, float) rval[self.b] = self.b_lr_scale return rval def get_monitoring_channels(self): return OrderedDict() def get_monitoring_channels_from_state(self, state, target=None): return OrderedDict() def set_input_space(self, space): self.input_space = space if not isinstance(space, Space): raise TypeError("Expected Space, got " + str(space) + " of type " + str(type(space))) self.input_dim = space.get_total_dimension() self.needs_reformat = not isinstance(space, VectorSpace) if self.no_affine: desired_dim = self.n_classes assert self.input_dim == desired_dim else: desired_dim = self.input_dim self.desired_space = VectorSpace(desired_dim) if not self.needs_reformat: assert self.desired_space == self.input_space rng = self.mlp.rng if self.irange is not None: assert self.istdev is None assert self.sparse_init is None W = rng.uniform(-self.irange, self.irange, (self.input_dim, self.n_groups, self.n_classes)) elif self.istdev is not None: assert self.sparse_init is None W = rng.randn(self.input_dim, self.n_groups, self.n_classes) * self.istdev else: raise NotImplementedError() self.W = sharedX(W, 'softmax_W') self._params = [self.b, self.W] def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() desired = self.W.get_value().T ipt = self.desired_space.format_as(desired, self.input_space) rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) return rval def get_weights(self): if not isinstance(self.input_space, VectorSpace): raise NotImplementedError() return self.W.get_value() def set_weights(self, weights): self.W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[ 0] != self.mlp.batch_size: raise ValueError("state_below should have batch size " + str(self.dbm.batch_size) + " but has " + str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 assert self.W.ndim == 3 Z = T.tensordot(state_below, self.W, axes=[[1], [0]]) + self.b rval = batched_softmax(Z) for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert value.shape[0] == self.mlp.batch_size return rval def cost(self, Y, Y_hat): return self.cost_from_cost_matrix(self.cost_matrix(Y, Y_hat)) def cost_from_cost_matrix(self, cost_matrix): return cost_matrix.sum(axis=2).mean() def cost_matrix(self, Y, Y_hat): return -Y * T.log(Y_hat + 0.000001) def get_weight_decay(self, coeff): if 
isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') return coeff * T.sqr(self.W).sum() def get_l1_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') W = self.W return coeff * abs(W).sum() def censor_updates(self, updates): return if self.max_row_norm is not None: W = self.W if W in updates: updated_W = updates[W] row_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=1)) desired_norms = T.clip(row_norms, 0, self.max_row_norm) updates[W] = updated_W * (desired_norms / (1e-7 + row_norms)).dimshuffle( 0, 'x') if self.max_col_norm is not None: assert self.max_row_norm is None W = self.W if W in updates: updated_W = updates[W] col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0)) desired_norms = T.clip(col_norms, 0, self.max_col_norm) updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))
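MultiSoftmax.fprop produces Z of shape (batch, n_groups, n_classes) and normalizes over the last axis only, so each group is an independent softmax. A small numpy sketch of that normalization (this stands in for batched_softmax, which is not shown here):

import numpy as np

rng = np.random.RandomState(0)
batch, n_groups, n_classes = 5, 3, 4
Z = rng.randn(batch, n_groups, n_classes)

Z_shift = Z - Z.max(axis=2, keepdims=True)  # for numerical stability
e = np.exp(Z_shift)
Y_hat = e / e.sum(axis=2, keepdims=True)    # softmax within each group
assert np.allclose(Y_hat.sum(axis=2), 1.)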
def fprop(self, state_below):
    vector_space = VectorSpace(self.output_space.get_total_dimension())
    X = self.output_space.format_as(state_below, vector_space)
    rval = T.dot(X - self.mean, self.P)
    rval = vector_space.format_as(rval, self.output_space)
    return rval
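This fprop flattens the batch to a design matrix, recenters it with a stored mean, and applies a fixed projection. In numpy terms (self.mean and self.P are stand-ins with made-up shapes):

import numpy as np

rng = np.random.RandomState(0)
batch, dim, n_components = 4, 6, 2
X = rng.randn(batch, dim)         # flattened batch, one example per row
mean = rng.randn(dim)             # stand-in for self.mean
P = rng.randn(dim, n_components)  # stand-in for self.P
rval = np.dot(X - mean, P)        # shape (batch, n_components)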
class BinaryVectorMaxPool(HiddenLayer): """ A hidden layer that does max-pooling on binary vectors. It has two sublayers, the detector layer and the pooling layer. The detector layer is its downward state and the pooling layer is its upward state. TODO: this layer uses (pooled, detector) as its total state, which can be confusing when listing all the states in the network left to right. Change this and pylearn2.expr.probabilistic_max_pooling to use (detector, pooled) """ def __init__(self, detector_layer_dim, pool_size, layer_name, irange=None, sparse_init=None, include_prob=1.0, init_bias=0.): """ include_prob: probability of including a weight element in the set of weights initialized to U(-irange, irange). If not included it is initialized to 0. """ self.__dict__.update(locals()) del self.self self.b = sharedX(np.zeros((self.detector_layer_dim, )) + init_bias, name=layer_name + '_b') def set_input_space(self, space): """ Note: this resets parameters! """ self.input_space = space if isinstance(space, VectorSpace): self.requires_reformat = False self.input_dim = space.dim else: self.requires_reformat = True self.input_dim = space.get_total_dimension() self.desired_space = VectorSpace(self.input_dim) if not (self.detector_layer_dim % self.pool_size == 0): raise ValueError( "detector_layer_dim = %d, pool_size = %d. Should be divisible but remainder is %d" % (self.detector_layer_dim, self.pool_size, self.detector_layer_dim % self.pool_size)) self.h_space = VectorSpace(self.detector_layer_dim) self.pool_layer_dim = self.detector_layer_dim / self.pool_size self.output_space = VectorSpace(self.pool_layer_dim) rng = self.dbm.rng if self.irange is not None: assert self.sparse_init is None W = rng.uniform(-self.irange, self.irange, (self.input_dim, self.detector_layer_dim)) * \ (rng.uniform(0.,1., (self.input_dim, self.detector_layer_dim)) < self.include_prob) else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.detector_layer_dim)) for i in xrange(self.detector_layer_dim): for j in xrange(self.sparse_init): idx = rng.randint(0, self.input_dim) while W[idx, i] != 0: idx = rng.randint(0, self.input_dim) W[idx, i] = rng.randn() W = sharedX(W) W.name = self.layer_name + '_W' self.transformer = MatrixMul(W) W, = self.transformer.get_params() assert W.name is not None def get_total_state_space(self): return CompositeSpace((self.output_space, self.h_space)) def get_params(self): assert self.b.name is not None W, = self.transformer.get_params() assert W.name is not None return self.transformer.get_params().union([self.b]) def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) W, = self.transformer.get_params() return coeff * T.sqr(W).sum() def get_weights(self): if self.requires_reformat: # This is not really an unimplemented case. # We actually don't know how to format the weights # in design space. 
We got the data in topo space # and we don't have access to the dataset raise NotImplementedError() W, = self.transformer.get_params() return W.get_value() def set_weights(self, weights): W, = self.transformer.get_params() W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def get_weights_view_shape(self): total = self.detector_layer_dim cols = self.pool_size if cols == 1: # Let the PatchViewer decidew how to arrange the units # when they're not pooled raise NotImplementedError() # When they are pooled, make each pooling unit have one row rows = total / cols return rows, cols def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() W, = self.transformer.get_params() W = W.T W = W.reshape((self.detector_layer_dim, self.input_space.shape[0], self.input_space.shape[1], self.input_space.nchannels)) W = Conv2DSpace.convert(W, self.input_space.axes, ('b', 0, 1, 'c')) return function([], W)() def upward_state(self, total_state): p, h = total_state self.h_space.validate(h) self.output_space.validate(p) return p def downward_state(self, total_state): p, h = total_state return h def get_monitoring_channels_from_state(self, state): P, H = state rval = {} if self.pool_size == 1: vars_and_prefixes = [(P, '')] else: vars_and_prefixes = [(P, 'p_'), (H, 'h_')] for var, prefix in vars_and_prefixes: v_max = var.max(axis=0) v_min = var.min(axis=0) v_mean = var.mean(axis=0) v_range = v_max - v_min for key, val in [('max_max', v_max.max()), ('max_mean', v_max.mean()), ('max_min', v_max.min()), ('min_max', v_min.max()), ('min_mean', v_min.mean()), ('min_max', v_min.max()), ('range_max', v_range.max()), ('range_mean', v_range.mean()), ('range_min', v_range.min()), ('mean_max', v_mean.max()), ('mean_mean', v_mean.mean()), ('mean_min', v_mean.min())]: rval[prefix + key] = val return rval def get_l1_act_cost(self, state, target, coeff, eps=None): rval = 0. P, H = state self.output_space.validate(P) self.h_space.validate(H) if self.pool_size == 1: # If the pool size is 1 then pools = detectors # and we should not penalize pools and detectors separately assert len(state) == 2 assert isinstance(target, float) assert isinstance(coeff, float) _, state = state state = [state] target = [target] coeff = [coeff] if eps is None: eps = [0.] else: eps = [eps] else: assert all([len(elem) == 2 for elem in [state, target, coeff]]) if eps is None: eps = [0., 0.] if target[1] < target[0]: warnings.warn( "Do you really want to regularize the detector units to be sparser than the pooling units?" ) for s, t, c, e in safe_zip(state, target, coeff, eps): assert all([isinstance(elem, float) for elem in [t, c, e]]) if c == 0.: continue m = s.mean(axis=0) assert m.ndim == 1 rval += T.maximum(abs(m - t) - e, 0.).mean() * c return rval def sample(self, state_below=None, state_above=None, layer_above=None, theano_rng=None): if theano_rng is None: raise ValueError( "theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list." 
) if state_above is not None: msg = layer_above.downward_message(state_above) else: msg = None if self.requires_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) z = self.transformer.lmul(state_below) + self.b p, h, p_sample, h_sample = max_pool_channels(z, self.pool_size, msg, theano_rng) return p_sample, h_sample def downward_message(self, downward_state): rval = self.transformer.lmul_T(downward_state) if self.requires_reformat: rval = self.desired_space.format_as(rval, self.input_space) return rval def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. """ t1 = time.time() empty_input = self.h_space.get_origin_batch(num_examples) h_state = sharedX(empty_input) default_z = T.zeros_like(h_state) + self.b theano_rng = MRG_RandomStreams(numpy_rng.randint(2**16)) p_exp, h_exp, p_sample, h_sample = max_pool_channels( z=default_z, pool_size=self.pool_size, theano_rng=theano_rng) assert h_sample.dtype == default_z.dtype p_state = sharedX(self.output_space.get_origin_batch(num_examples)) t2 = time.time() f = function([], updates={p_state: p_sample, h_state: h_sample}) t3 = time.time() f() t4 = time.time() print str(self) + '.make_state took', t4 - t1 print '\tcompose time:', t2 - t1 print '\tcompile time:', t3 - t2 print '\texecute time:', t4 - t3 p_state.name = 'p_sample_shared' h_state.name = 'h_sample_shared' return p_state, h_state def expected_energy_term(self, state, average, state_below, average_below): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError( "self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) downward_state = self.downward_state(state) self.h_space.validate(downward_state) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(downward_state, self.b) weights_term = (self.transformer.lmul(state_below) * downward_state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval def mf_update(self, state_below, state_above, layer_above=None, double_weights=False, iter_name=None): self.input_space.validate(state_below) if self.requires_reformat: if not isinstance(state_below, tuple): for sb in get_debug_values(state_below): if sb.shape[0] != self.dbm.batch_size: raise ValueError( "self.dbm.batch_size is %d but got shape of %d" % (self.dbm.batch_size, sb.shape[0])) assert reduce(lambda x, y: x * y, sb.shape[1:]) == self.input_dim state_below = self.input_space.format_as(state_below, self.desired_space) if iter_name is None: iter_name = 'anon' if state_above is not None: assert layer_above is not None msg = layer_above.downward_message(state_above) msg.name = 'msg_from_' + layer_above.layer_name + '_to_' + self.layer_name + '[' + iter_name + ']' else: msg = None if double_weights: state_below = 2. 
* state_below state_below.name = self.layer_name + '_' + iter_name + '_2state' z = self.transformer.lmul(state_below) + self.b if self.layer_name is not None and iter_name is not None: z.name = self.layer_name + '_' + iter_name + '_z' p, h = max_pool_channels(z, self.pool_size, msg) p.name = self.layer_name + '_p_' + iter_name h.name = self.layer_name + '_h_' + iter_name return p, h
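The mean-field values that mf_update obtains from max_pool_channels follow the usual DBM pooling formulation: each pool of pool_size detector units competes with an extra all-off state, so h_i = exp(z_i) / (1 + sum_j exp(z_j)) and the pooling unit is p = sum_i h_i. A rough numpy sketch of those expressions (the real implementation lives in pylearn2.expr.probabilistic_max_pooling, not here):

import numpy as np

rng = np.random.RandomState(0)
batch, n_pools, pool_size = 2, 3, 4
z = rng.randn(batch, n_pools, pool_size)

m = np.maximum(z.max(axis=2, keepdims=True), 0.)   # shift for numerical stability
e = np.exp(z - m)
denom = np.exp(-m) + e.sum(axis=2, keepdims=True)  # exp(-m) is the all-off state
h = e / denom                                      # detector-unit mean field
p = h.sum(axis=2)                                  # pooling-unit mean field
assert np.all((p > 0.) & (p < 1.))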
class HingeLoss(Layer): def __init__(self, n_classes, layer_name, irange = None, istdev = None, sparse_init = None): self.__dict__.update(locals()) del self.self self.output_space = VectorSpace(n_classes) self.b = sharedX(np.zeros((n_classes,)), name = 'hingeloss_b') def get_monitoring_channels(self): W = self.W assert W.ndim == 2 sq_W = T.sqr(W) row_norms = T.sqrt(sq_W.sum(axis=1)) col_norms = T.sqrt(sq_W.sum(axis=0)) return OrderedDict([ ('row_norms_min' , row_norms.min()), ('row_norms_mean' , row_norms.mean()), ('row_norms_max' , row_norms.max()), ('col_norms_min' , col_norms.min()), ('col_norms_mean' , col_norms.mean()), ('col_norms_max' , col_norms.max()), ]) def get_monitoring_channels_from_state(self, state, target=None): mx = state.max(axis=1) rval = OrderedDict([ ('mean_max_class' , mx.mean()), ('max_max_class' , mx.max()), ('min_max_class' , mx.min()) ]) if target is not None: y_hat = self.target_convert(T.argmax(state, axis=1)) #Assume target is in [0,1] as binary one-hot y = self.target_convert(T.argmax(target, axis=1)) misclass = T.neq(y, y_hat).mean() misclass = T.cast(misclass, config.floatX) rval['misclass'] = misclass rval['nll'] = self.cost(Y_hat=state, Y=target) return rval def set_input_space(self, space): self.input_space = space if not isinstance(space, Space): raise TypeError("Expected Space, got "+ str(space)+" of type "+str(type(space))) self.input_dim = space.get_total_dimension() self.needs_reformat = not isinstance(space, VectorSpace) desired_dim = self.input_dim self.desired_space = VectorSpace(desired_dim) if not self.needs_reformat: assert self.desired_space == self.input_space rng = self.mlp.rng if self.irange is not None: assert self.istdev is None assert self.sparse_init is None W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes)) elif self.istdev is not None: assert self.sparse_init is None W = rng.randn(self.input_dim, self.n_classes) * self.istdev else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.n_classes)) for i in xrange(self.n_classes): for j in xrange(self.sparse_init): idx = rng.randint(0, self.input_dim) while W[idx, i] != 0.: idx = rng.randint(0, self.input_dim) W[idx, i] = rng.randn() self.W = sharedX(W, 'hingeloss_W' ) self._params = [ self.b, self.W ] def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() desired = self.W.get_value().T ipt = self.desired_space.format_as(desired, self.input_space) rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) return rval def get_weights(self): if not isinstance(self.input_space, VectorSpace): raise NotImplementedError() return self.W.get_value() def set_weights(self, weights): self.W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def fprop(self, state_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) for value in get_debug_values(state_below): if self.mlp.batch_size is not None and value.shape[0] != self.mlp.batch_size: raise ValueError("state_below should have batch size "+str(self.dbm.batch_size)+" but has "+str(value.shape[0])) self.desired_space.validate(state_below) assert state_below.ndim == 2 assert self.W.ndim == 2 b = self.b W = self.W rval = T.dot(state_below, W) + b for value in get_debug_values(rval): if self.mlp.batch_size is not None: assert 
value.shape[0] == self.mlp.batch_size return rval def target_convert(self, Y): ''' converts target [0,1] to [-1, 1] ''' Y_t = 2. * Y - 1. return Y_t def hinge_cost(self, W, Y, Y_hat, C=1.): #prob = .5 * T.dot(self.W.T, self.W) + C * (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1) prob = (T.maximum(1 - Y * Y_hat, 0) ** 2.).sum(axis=1) return prob def cost(self, Y, Y_hat): """ Y must be one-hot binary. Y_hat is a hinge loss estimate. of Y. """ assert hasattr(Y_hat, 'owner') owner = Y_hat.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 Y_hat, = owner.inputs owner = Y_hat.owner op = owner.op assert Y_hat.ndim == 2 Y_t = self.target_convert(Y) prob = self.hinge_cost(self.W, Y_t, Y_hat) assert prob.ndim == 1 rval = prob.mean() return rval def cost_matrix(self, Y, Y_hat): """ Y must be one-hot binary. Y_hat is a hinge loss estimate. of Y. """ assert hasattr(Y_hat, 'owner') owner = Y_hat.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 Y_hat, = owner.inputs owner = Y_hat.owner op = owner.op assert Y_hat.ndim == 2 Y_t = self.target_convert(Y) prob = self.hinge_cost(self.W, Y_t, Y_hat) return prob def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') return coeff * T.sqr(self.W).sum() def get_l1_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) or hasattr(coeff, 'dtype') W = self.W return coeff * abs(W).sum()
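A worked numpy example of the squared hinge cost above for a single example: the one-hot target is mapped from {0, 1} to {-1, +1} by target_convert, and each class then contributes max(1 - y * y_hat, 0) ** 2.

import numpy as np

Y = np.array([[0., 1., 0.]])          # one-hot target, true class is 1
Y_hat = np.array([[0.2, 0.5, -1.3]])  # linear outputs from fprop
Y_t = 2. * Y - 1.                     # target_convert: [[-1., 1., -1.]]

per_class = np.maximum(1. - Y_t * Y_hat, 0.) ** 2.  # [[1.44, 0.25, 0.]]
cost = per_class.sum(axis=1).mean()                 # 1.69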
class Softmax(HiddenLayer): def __init__(self, n_classes, layer_name, irange = None, sparse_init = None, W_lr_scale = None): if isinstance(W_lr_scale, str): W_lr_scale = float(W_lr_scale) self.__dict__.update(locals()) del self.self assert isinstance(n_classes, int) self.output_space = VectorSpace(n_classes) self.b = sharedX( np.zeros((n_classes,)), name = 'softmax_b') def get_lr_scalers(self): rval = {} # Patch old pickle files if not hasattr(self, 'W_lr_scale'): self.W_lr_scale = None if self.W_lr_scale is not None: assert isinstance(self.W_lr_scale, float) rval[self.W] = self.W_lr_scale return rval def get_total_state_space(self): return self.output_space def get_monitoring_channels_from_state(self, state): mx = state.max(axis=1) return { 'mean_max_class' : mx.mean(), 'max_max_class' : mx.max(), 'min_max_class' : mx.min() } def set_input_space(self, space): self.input_space = space if not isinstance(space, Space): raise TypeError("Expected Space, got "+ str(space)+" of type "+str(type(space))) self.input_dim = space.get_total_dimension() self.needs_reformat = not isinstance(space, VectorSpace) self.desired_space = VectorSpace(self.input_dim) if not self.needs_reformat: assert self.desired_space == self.input_space rng = self.dbm.rng if self.irange is not None: assert self.sparse_init is None W = rng.uniform(-self.irange,self.irange, (self.input_dim,self.n_classes)) else: assert self.sparse_init is not None W = np.zeros((self.input_dim, self.n_classes)) for i in xrange(self.n_classes): for j in xrange(self.sparse_init): idx = rng.randint(0, self.input_dim) while W[idx, i] != 0.: idx = rng.randint(0, self.input_dim) W[idx, i] = rng.randn() self.W = sharedX(W, 'softmax_W' ) self._params = [ self.b, self.W ] def get_weights_topo(self): if not isinstance(self.input_space, Conv2DSpace): raise NotImplementedError() desired = self.W.get_value().T ipt = self.desired_space.format_as(desired, self.input_space) rval = Conv2DSpace.convert_numpy(ipt, self.input_space.axes, ('b', 0, 1, 'c')) return rval def get_weights(self): if not isinstance(self.input_space, VectorSpace): raise NotImplementedError() return self.W.get_value() def set_weights(self, weights): self.W.set_value(weights) def set_biases(self, biases): self.b.set_value(biases) def get_biases(self): return self.b.get_value() def get_weights_format(self): return ('v', 'h') def sample(self, state_below = None, state_above = None, layer_above = None, theano_rng = None): if state_above is not None: # If you implement this case, also add a unit test for it. # Or at least add a warning that it is not tested. 
raise NotImplementedError() if theano_rng is None: raise ValueError("theano_rng is required; it just defaults to None so that it may appear after layer_above / state_above in the list.") self.input_space.validate(state_below) # patch old pickle files if not hasattr(self, 'needs_reformat'): self.needs_reformat = self.needs_reshape del self.needs_reshape if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) self.desired_space.validate(state_below) z = T.dot(state_below, self.W) + self.b h_exp = T.nnet.softmax(z) h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) return h_sample def mf_update(self, state_below, state_above = None, layer_above = None, double_weights = False, iter_name = None): if state_above is not None: raise NotImplementedError() if double_weights: raise NotImplementedError() self.input_space.validate(state_below) # patch old pickle files if not hasattr(self, 'needs_reformat'): self.needs_reformat = self.needs_reshape del self.needs_reshape if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) self.desired_space.validate(state_below) """ from pylearn2.utils import serial X = serial.load('/u/goodfeli/galatea/dbm/inpaint/expdir/cifar10_N3_interm_2_features.pkl') state_below = Verify(X,'features')(state_below) """ assert self.W.ndim == 2 assert state_below.ndim == 2 b = self.b Z = T.dot(state_below, self.W) + b #Z = Print('Z')(Z) rval = T.nnet.softmax(Z) return rval def downward_message(self, downward_state): rval = T.dot(downward_state, self.W.T) rval = self.desired_space.format_as(rval, self.input_space) return rval def recons_cost(self, Y, Y_hat_unmasked, drop_mask_Y, scale): """ scale is because the visible layer also goes into the cost. it uses the mean over units and examples, so that the scale of the cost doesn't change too much with batch size or example size. we need to multiply this cost by scale to make sure that it is put on the same scale as the reconstruction cost for the visible units. ie, scale should be 1/nvis """ Y_hat = Y_hat_unmasked assert hasattr(Y_hat, 'owner') owner = Y_hat.owner assert owner is not None op = owner.op if isinstance(op, Print): assert len(owner.inputs) == 1 Y_hat, = owner.inputs owner = Y_hat.owner op = owner.op assert isinstance(op, T.nnet.Softmax) z ,= owner.inputs assert z.ndim == 2 z = z - z.max(axis=1).dimshuffle(0, 'x') log_prob = z - T.exp(z).sum(axis=1).dimshuffle(0, 'x') # we use sum and not mean because this is really one variable per row log_prob_of = (Y * log_prob).sum(axis=1) masked = log_prob_of * drop_mask_Y assert masked.ndim == 1 rval = masked.mean() * scale return - rval def make_state(self, num_examples, numpy_rng): """ Returns a shared variable containing an actual state (not a mean field state) for this variable. 
""" t1 = time.time() empty_input = self.output_space.get_origin_batch(num_examples) h_state = sharedX(empty_input) default_z = T.zeros_like(h_state) + self.b theano_rng = MRG_RandomStreams(numpy_rng.randint(2 ** 16)) h_exp = T.nnet.softmax(default_z) h_sample = theano_rng.multinomial(pvals = h_exp, dtype = h_exp.dtype) p_state = sharedX( self.output_space.get_origin_batch( num_examples)) t2 = time.time() f = function([], updates = { h_state : h_sample }) t3 = time.time() f() t4 = time.time() print str(self)+'.make_state took',t4-t1 print '\tcompose time:',t2-t1 print '\tcompile time:',t3-t2 print '\texecute time:',t4-t3 h_state.name = 'softmax_sample_shared' return h_state def get_weight_decay(self, coeff): if isinstance(coeff, str): coeff = float(coeff) assert isinstance(coeff, float) return coeff * T.sqr(self.W).sum() def expected_energy_term(self, state, average, state_below, average_below): self.input_space.validate(state_below) if self.needs_reformat: state_below = self.input_space.format_as(state_below, self.desired_space) self.desired_space.validate(state_below) # Energy function is linear so it doesn't matter if we're averaging or not # Specifically, our terms are -u^T W d - b^T d where u is the upward state of layer below # and d is the downward state of this layer bias_term = T.dot(state, self.b) weights_term = (T.dot(state_below, self.W) * state).sum(axis=1) rval = -bias_term - weights_term assert rval.ndim == 1 return rval